Merge WITH CONFLICTS 2025-03-11 main commit '7c462b3417ecd3ae3907f3480f3b8a8c99fc6d7b' into yuchen/direct-io-delta-image-layer-write

Conflicts: pageserver/src/tenant/blob_io.rs
2026-05-28 10:30:40 +00:00 · 2025-04-09 19:39:12 +02:00
parent 537eb334f2 7c462b3417
commit f078d7e1a9
380 changed files with 13238 additions and 5308 deletions
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -33,8 +33,9 @@ use utils::lsn::Lsn;

 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::Version;
-use crate::tenant::Timeline;
 use crate::tenant::storage_layer::IoConcurrency;
+use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::{PageReconstructError, Timeline};

 #[derive(Debug, thiserror::Error)]
 pub enum BasebackupError {
@@ -42,6 +43,26 @@ pub enum BasebackupError {
    Server(#[from] anyhow::Error),
    #[error("basebackup client error {0:#} when {1}")]
    Client(#[source] io::Error, &'static str),
+    #[error("basebackup during shutdown")]
+    Shutdown,
+}
+
+impl From<PageReconstructError> for BasebackupError {
+    fn from(value: PageReconstructError) -> Self {
+        if matches!(value, PageReconstructError::Cancelled) {
+            return Self::Shutdown; // cancelled reads surface as shutdown
+        }
+        Self::Server(value.into())
+    }
+}
+
+impl From<GetVectoredError> for BasebackupError {
+    fn from(value: GetVectoredError) -> Self {
+        if matches!(value, GetVectoredError::Cancelled) {
+            return Self::Shutdown; // cancelled reads surface as shutdown
+        }
+        Self::Server(value.into())
+    }
 }

 /// Create basebackup with non-rel data in it.
@@ -127,7 +148,7 @@ where
            timeline
                .gate
                .enter()
-                .map_err(|e| BasebackupError::Server(e.into()))?,
+                .map_err(|_| BasebackupError::Shutdown)?,
        ),
    };
    basebackup
@@ -323,8 +344,7 @@ where
            let slru_partitions = self
                .timeline
                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?
+                .await?
                .partition(
                    self.timeline.get_shard_identity(),
                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
@@ -336,11 +356,10 @@ where
                let blocks = self
                    .timeline
                    .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                    .await?;

                for (key, block) in blocks {
-                    let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
+                    let block = block?;
                    slru_builder.add_block(&key, block).await?;
                }
            }
@@ -349,11 +368,8 @@ where

        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in self
-            .timeline
-            .list_dbdirs(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+        for ((spcnode, dbnode), has_relmap_file) in
+            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
        {
            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;

@@ -362,8 +378,7 @@ where
            let rels = self
                .timeline
                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
                // contents of UNLOGGED relations. Postgres copies it in
@@ -391,8 +406,7 @@ where
        let aux_files = self
            .timeline
            .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;
        let aux_scan_time = start_time.elapsed();
        let aux_estimated_size = aux_files
            .values()
@@ -451,16 +465,14 @@ where
        for xid in self
            .timeline
            .list_twophase_files(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+            .await?
        {
            self.add_twophase_file(xid).await?;
        }
        let repl_origins = self
            .timeline
            .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;
        let n_origins = repl_origins.len();
        if n_origins != 0 {
            //
@@ -505,8 +517,7 @@ where
        let nblocks = self
            .timeline
            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;

        // If the relation is empty, create an empty file
        if nblocks == 0 {
@@ -532,8 +543,7 @@ where
                    // TODO: investigate using get_vectored for the entire startblk..endblk range.
                    // But this code path is not on the critical path for most basebackups (?).
                    .get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                    .await?;
                segment_data.extend_from_slice(&img[..]);
            }

@@ -567,8 +577,7 @@ where
            let img = self
                .timeline
                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;

            if img.len()
                != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
@@ -622,8 +631,7 @@ where
                && self
                    .timeline
                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?
+                    .await?
                    .is_empty()
            {
                return Ok(());
@@ -674,8 +682,7 @@ where
        let img = self
            .timeline
            .get_twophase_file(xid, self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;

        let mut buf = BytesMut::new();
        buf.extend_from_slice(&img[..]);
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -14,6 +14,7 @@ use camino::Utf8Path;
 use clap::{Arg, ArgAction, Command};
 use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
 use metrics::set_build_info_metric;
+use nix::sys::socket::{setsockopt, sockopt};
 use pageserver::config::{PageServerConf, PageserverIdentity};
 use pageserver::controller_upcall_client::ControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
@@ -24,11 +25,12 @@ use pageserver::task_mgr::{
 };
 use pageserver::tenant::{TenantSharedResources, mgr, secondary};
 use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service,
-    task_mgr, virtual_file,
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
+    page_cache, page_service, task_mgr, virtual_file,
 };
 use postgres_backend::AuthType;
 use remote_storage::GenericRemoteStorage;
+use rustls_pki_types::{CertificateDer, PrivateKeyDer};
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
@@ -342,11 +344,25 @@ fn start_pageserver(
    info!("Starting pageserver http handler on {http_addr}");
    let http_listener = tcp_listener::bind(http_addr)?;

-    let pg_addr = &conf.listen_pg_addr;
+    let https_listener = match conf.listen_https_addr.as_ref() {
+        Some(https_addr) => {
+            info!("Starting pageserver https handler on {https_addr}");
+            Some(tcp_listener::bind(https_addr)?)
+        }
+        None => None,
+    };

+    let pg_addr = &conf.listen_pg_addr;
    info!("Starting pageserver pg protocol handler on {pg_addr}");
    let pageserver_listener = tcp_listener::bind(pg_addr)?;

+    // Enable SO_KEEPALIVE on the socket, to detect dead connections faster.
+    // These are configured via net.ipv4.tcp_keepalive_* sysctls.
+    //
+    // TODO: also set this on the walreceiver socket, but tokio-postgres doesn't
+    // support enabling keepalives while using the default OS sysctls.
+    setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?;
+
    // Launch broker client
    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
    let broker_client = WALRECEIVER_RUNTIME
@@ -567,9 +583,8 @@ fn start_pageserver(

    // Start up the service to handle HTTP mgmt API request. We created the
    // listener earlier already.
-    let http_endpoint_listener = {
+    let (http_endpoint_listener, https_endpoint_listener) = {
        let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
-        let cancel = CancellationToken::new();

        let router_state = Arc::new(
            http::routes::State::new(
@@ -584,22 +599,51 @@ fn start_pageserver(
            )
            .context("Failed to initialize router state")?,
        );
+
        let router = http::make_router(router_state, launch_ts, http_auth.clone())?
            .build()
            .map_err(|err| anyhow!(err))?;
-        let service = http_utils::RouterService::new(router).unwrap();
-        let server = hyper0::Server::from_tcp(http_listener)?
-            .serve(service)
-            .with_graceful_shutdown({
-                let cancel = cancel.clone();
-                async move { cancel.clone().cancelled().await }
-            });

-        let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-            "http endpoint listener",
-            server,
-        ));
-        HttpEndpointListener(CancellableTask { task, cancel })
+        let service =
+            Arc::new(http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow!(err))?);
+
+        let http_task = {
+            let server =
+                http_utils::server::Server::new(Arc::clone(&service), http_listener, None)?;
+            let cancel = CancellationToken::new();
+
+            let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+                "http endpoint listener",
+                server.serve(cancel.clone()),
+            ));
+            HttpEndpointListener(CancellableTask { task, cancel })
+        };
+
+        let https_task = match https_listener {
+            Some(https_listener) => {
+                let certs = load_certs(&conf.ssl_cert_file)?;
+                let key = load_private_key(&conf.ssl_key_file)?;
+
+                let server_config = rustls::ServerConfig::builder()
+                    .with_no_client_auth()
+                    .with_single_cert(certs, key)?;
+
+                let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
+
+                let server =
+                    http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
+                let cancel = CancellationToken::new();
+
+                let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+                    "https endpoint listener",
+                    server.serve(cancel.clone()),
+                ));
+                Some(HttpsEndpointListener(CancellableTask { task, cancel }))
+            }
+            None => None,
+        };
+
+        (http_task, https_task)
    };

    let consumption_metrics_tasks = {
@@ -675,6 +719,7 @@ fn start_pageserver(
        shutdown_pageserver.cancel();
        pageserver::shutdown_pageserver(
            http_endpoint_listener,
+            https_endpoint_listener,
            page_service,
            consumption_metrics_tasks,
            disk_usage_eviction_task,
@@ -689,6 +734,25 @@ fn start_pageserver(
    })
 }

+/// Reads all PEM certificates from `filename`; the first read error aborts the whole load.
+fn load_certs(filename: &Utf8Path) -> std::io::Result<Vec<CertificateDer<'static>>> {
+    let mut pem = std::io::BufReader::new(std::fs::File::open(filename)?);
+
+    rustls_pemfile::certs(&mut pem).collect()
+}
+
+/// Loads the first private key found in the PEM file at `filename`.
+///
+/// Errors if the file cannot be opened or read, or if it contains no private key.
+fn load_private_key(filename: &Utf8Path) -> anyhow::Result<PrivateKeyDer<'static>> {
+    let file = std::fs::File::open(filename)?;
+    let mut reader = std::io::BufReader::new(file);
+
+    // Build the error lazily so the success path does not construct an unused `anyhow::Error`.
+    rustls_pemfile::private_key(&mut reader)?
+        .ok_or_else(|| anyhow::anyhow!("no private key found in {}", filename.as_str()))
+}
+
 async fn create_remote_storage_client(
    conf: &'static PageServerConf,
 ) -> anyhow::Result<GenericRemoteStorage> {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -53,6 +53,11 @@ pub struct PageServerConf {
    pub listen_pg_addr: String,
    /// Example (default): 127.0.0.1:9898
    pub listen_http_addr: String,
+    /// Example: 127.0.0.1:9899
+    pub listen_https_addr: Option<String>,
+
+    pub ssl_key_file: Utf8PathBuf,
+    pub ssl_cert_file: Utf8PathBuf,

    /// Current availability zone. Used for traffic metrics.
    pub availability_zone: Option<String>,
@@ -194,6 +199,13 @@ pub struct PageServerConf {
    /// Interpreted protocol feature: if enabled, validate that the logical WAL received from
    /// safekeepers does not have gaps.
    pub validate_wal_contiguity: bool,
+
+    /// When set, the previously written to disk heatmap is loaded on tenant attach and used
+    /// to avoid clobbering the heatmap from new, cold, attached locations.
+    pub load_previous_heatmap: bool,
+
+    /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
+    pub generate_unarchival_heatmap: bool,
 }

 /// Token for authentication to safekeepers
@@ -310,6 +322,9 @@ impl PageServerConf {
        let pageserver_api::config::ConfigToml {
            listen_pg_addr,
            listen_http_addr,
+            listen_https_addr,
+            ssl_key_file,
+            ssl_cert_file,
            availability_zone,
            wait_lsn_timeout,
            wal_redo_timeout,
@@ -358,6 +373,8 @@ impl PageServerConf {
            get_vectored_concurrent_io,
            enable_read_path_debugging,
            validate_wal_contiguity,
+            load_previous_heatmap,
+            generate_unarchival_heatmap,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -366,6 +383,9 @@ impl PageServerConf {
            // ------------------------------------------------------------
            listen_pg_addr,
            listen_http_addr,
+            listen_https_addr,
+            ssl_key_file,
+            ssl_cert_file,
            availability_zone,
            wait_lsn_timeout,
            wal_redo_timeout,
@@ -447,6 +467,8 @@ impl PageServerConf {
            no_sync: no_sync.unwrap_or(false),
            enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
            validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
+            load_previous_heatmap: load_previous_heatmap.unwrap_or(true),
+            generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true),
        };

        // ------------------------------------------------------------
@@ -480,7 +502,9 @@ impl PageServerConf {
    #[cfg(test)]
    pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
        let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
-        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
+
+        let test_id = uuid::Uuid::new_v4();
+        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}"))
    }

    pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -493,6 +517,8 @@ impl PageServerConf {
            metric_collection_interval: Duration::from_secs(60),
            synthetic_size_calculation_interval: Duration::from_secs(60),
            background_task_maximum_delay: Duration::ZERO,
+            load_previous_heatmap: Some(true),
+            generate_unarchival_heatmap: Some(true),
            ..Default::default()
        };
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -89,16 +89,112 @@
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.

-use crate::task_mgr::TaskKind;
+use std::sync::Arc;
+
+use once_cell::sync::Lazy;
+use tracing::warn;
+use utils::{id::TimelineId, shard::TenantShardId};
+
+use crate::{
+    metrics::{StorageIoSizeMetrics, TimelineMetrics},
+    task_mgr::TaskKind,
+    tenant::Timeline,
+};

 // The main structure of this module, see module-level comment.
-#[derive(Debug)]
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
    access_stats_behavior: AccessStatsBehavior,
    page_content_kind: PageContentKind,
    read_path_debug: bool,
+    scope: Scope,
+}
+
+#[derive(Clone)]
+pub(crate) enum Scope {
+    Global {
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
+    },
+    SecondaryTenant {
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
+    },
+    SecondaryTimeline {
+        io_size_metrics: crate::metrics::StorageIoSizeMetrics,
+    },
+    Timeline {
+        // The `Arc<TimelineMetrics>` is deliberately wrapped in a second `Arc` (hence the
+        // clippy allow below): cloning the outer `Arc` when creating child contexts avoids
+        // contending on the ref counters of the inner `Arc<TimelineMetrics>`, which is shared
+        // among all tasks operating on the timeline, especially concurrent page_service connections.
+        #[allow(clippy::redundant_allocation)]
+        arc_arc: Arc<Arc<TimelineMetrics>>,
+    },
+    #[cfg(test)]
+    UnitTest {
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
+    },
+}
+
+static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
+    Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));
+
+impl Scope {
+    pub(crate) fn new_global() -> Self {
+        Self::Global {
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
+        }
+    }
+    /// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
+    /// of a compaction iteration.
+    pub(crate) fn new_timeline(timeline: &Timeline) -> Self {
+        let arc_arc = Arc::new(Arc::clone(&timeline.metrics));
+        Self::Timeline { arc_arc }
+    }
+    /// Shares the metrics `Arc` of `timeline_handle` instead of allocating a new one.
+    pub(crate) fn new_page_service_pagestream(
+        timeline_handle: &crate::tenant::timeline::handle::Handle<
+            crate::page_service::TenantManagerTypes,
+        >,
+    ) -> Self {
+        Self::Timeline {
+            arc_arc: Arc::clone(&timeline_handle.metrics),
+        }
+    }
+    /// Creates a metrics instance from the tenant id, shard slug and timeline id strings.
+    pub(crate) fn new_secondary_timeline(
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Self {
+        // TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.
+        let io_size_metrics = crate::metrics::StorageIoSizeMetrics::new(
+            &tenant_shard_id.tenant_id.to_string(),
+            &tenant_shard_id.shard_slug().to_string(),
+            &timeline_id.to_string(),
+        );
+        Self::SecondaryTimeline { io_size_metrics }
+    }
+    pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
+        // Before propagating metrics via RequestContext, the labels were inferred from file path.
+        // The only user of VirtualFile at tenant scope is the heatmap download & read.
+        // The inferred labels for the path of the heatmap file on local disk were that of the global metric (*,*,*).
+        // Thus, we do the same here, and extend that for anything secondary-tenant scoped.
+        //
+        // If we want to have (tenant_id, shard_id, '*') labels for secondary tenants in the future,
+        // we will need to think about the metric lifecycle, i.e., remove them during secondary tenant shutdown,
+        // like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
+        // at this point, so, we were able to completely side-step tenant-scoped stuff there).
+        Self::SecondaryTenant {
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
+        }
+    }
+    /// Test-only scope; points at the same static metrics as the global scope.
+    #[cfg(test)]
+    pub(crate) fn new_unit_test() -> Self {
+        Self::UnitTest {
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
+        }
+    }
+}

 /// The kind of access to the page cache.
@@ -157,6 +253,7 @@ impl RequestContextBuilder {
                access_stats_behavior: AccessStatsBehavior::Update,
                page_content_kind: PageContentKind::Unknown,
                read_path_debug: false,
+                scope: Scope::new_global(),
            },
        }
    }
@@ -171,10 +268,16 @@ impl RequestContextBuilder {
                access_stats_behavior: original.access_stats_behavior,
                page_content_kind: original.page_content_kind,
                read_path_debug: original.read_path_debug,
+                scope: original.scope.clone(),
            },
        }
    }

+    pub fn task_kind(mut self, k: TaskKind) -> Self {
+        self.inner.task_kind = k;
+        self
+    }
+
    /// Configure the DownloadBehavior of the context: whether to
    /// download missing layers, and/or warn on the download.
    pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
@@ -199,6 +302,11 @@ impl RequestContextBuilder {
        self
    }

+    pub(crate) fn scope(mut self, s: Scope) -> Self {
+        self.inner.scope = s;
+        self
+    }
+
    pub fn build(self) -> RequestContext {
        self.inner
    }
@@ -281,7 +389,50 @@ impl RequestContext {
    }

    fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        Self::new(task_kind, download_behavior)
+        RequestContextBuilder::extend(self)
+            .task_kind(task_kind)
+            .download_behavior(download_behavior)
+            .build()
+    }
+
+    pub fn with_scope_timeline(&self, timeline: &Arc<Timeline>) -> Self {
+        RequestContextBuilder::extend(self)
+            .scope(Scope::new_timeline(timeline))
+            .build()
+    }
+
+    pub(crate) fn with_scope_page_service_pagestream(
+        &self,
+        timeline_handle: &crate::tenant::timeline::handle::Handle<
+            crate::page_service::TenantManagerTypes,
+        >,
+    ) -> Self {
+        RequestContextBuilder::extend(self)
+            .scope(Scope::new_page_service_pagestream(timeline_handle))
+            .build()
+    }
+
+    pub fn with_scope_secondary_timeline(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Self {
+        RequestContextBuilder::extend(self)
+            .scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id))
+            .build()
+    }
+
+    pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self {
+        RequestContextBuilder::extend(self)
+            .scope(Scope::new_secondary_tenant(tenant_shard_id))
+            .build()
+    }
+
+    #[cfg(test)]
+    pub fn with_scope_unit_test(&self) -> Self {
+        RequestContextBuilder::new(TaskKind::UnitTest) // NOTE(review): `self` is never read
+            .scope(Scope::new_unit_test())
+            .build()
     }

    pub fn task_kind(&self) -> TaskKind {
@@ -303,4 +454,38 @@ impl RequestContext {
    pub(crate) fn read_path_debug(&self) -> bool {
        self.read_path_debug
    }
+
+    /// Returns the I/O size metrics selected by this context's scope. A `Global` scope is not
+    /// expected here: it panics in unit-test and `testing`-feature builds, and otherwise logs a
+    /// rate-limited warning (with a backtrace) before returning that scope's metrics.
+    pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
+        let io_size_metrics = match &self.scope {
+            Scope::Timeline { arc_arc } => return &arc_arc.storage_io_size,
+            Scope::SecondaryTimeline { io_size_metrics } => return io_size_metrics,
+            Scope::SecondaryTenant { io_size_metrics } => return io_size_metrics,
+            #[cfg(test)]
+            Scope::UnitTest { io_size_metrics } => return io_size_metrics,
+            Scope::Global { io_size_metrics } => io_size_metrics,
+        };
+
+        if cfg!(test) || cfg!(feature = "testing") {
+            panic!("all VirtualFile instances are timeline-scoped");
+        }
+
+        use once_cell::sync::Lazy;
+        use std::sync::Mutex;
+        use std::time::Duration;
+        use utils::rate_limit::RateLimit;
+        static LIMIT: Lazy<Mutex<RateLimit>> =
+            Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
+        LIMIT.lock().unwrap().call2(|rate_limit_stats| {
+            warn!(
+                %rate_limit_stats,
+                backtrace=%std::backtrace::Backtrace::force_capture(),
+                "all VirtualFile instances are timeline-scoped",
+            );
+        });
+
+        io_size_metrics
+    }
 }
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -84,6 +84,7 @@ impl ControllerUpcallClient {
        })
    }

+    #[tracing::instrument(skip_all)]
    async fn retry_http_forever<R, T>(
        &self,
        url: &url::Url,
@@ -108,7 +109,7 @@ impl ControllerUpcallClient {
            |_| false,
            3,
            u32::MAX,
-            "calling control plane generation validation API",
+            "storage controller upcall",
            &self.cancel,
        )
        .await
@@ -125,11 +126,12 @@ impl ControllerUpcallClient {

 impl ControlPlaneGenerationsApi for ControllerUpcallClient {
    /// Block until we get a successful response, or error out if we are shut down
+    #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
    async fn re_attach(
        &self,
        conf: &PageServerConf,
    ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
-        let re_attach_path = self
+        let url = self
            .base_url
            .join("re-attach")
            .expect("Failed to build re-attach path");
@@ -179,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
                        listen_pg_port: m.postgres_port,
                        listen_http_addr: m.http_host,
                        listen_http_port: m.http_port,
-                        listen_https_port: None, // TODO: Support https.
+                        listen_https_port: m.https_port,
                        availability_zone_id: az_id.expect("Checked above"),
                    })
                }
@@ -205,7 +207,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
            register: register.clone(),
        };

-        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
+        let response: ReAttachResponse = self.retry_http_forever(&url, request).await?;
        tracing::info!(
            "Received re-attach response with {} tenants (node {}, register: {:?})",
            response.tenants.len(),
@@ -223,11 +225,12 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
    }

    /// Block until we get a successful response, or error out if we are shut down
+    #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
    async fn validate(
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
    ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
-        let re_attach_path = self
+        let url = self
            .base_url
            .join("validate")
            .expect("Failed to build validate path");
@@ -257,8 +260,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
                return Err(RetryForeverError::ShuttingDown);
            }

-            let response: ValidateResponse =
-                self.retry_http_forever(&re_attach_path, request).await?;
+            let response: ValidateResponse = self.retry_http_forever(&url, request).await?;
            for rt in response.tenants {
                result.insert(rt.id, rt.valid);
            }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -842,6 +842,12 @@ paths:
        required: false
        schema:
          type: integer
+      - name: recurse
+        description: When set, will recurse with the downloads into ancestor timelines
+        in: query
+        required: false
+        schema:
+          type: boolean
    post:
      description: |
        Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -37,7 +37,8 @@ use pageserver_api::models::{
    TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest,
    TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode,
    TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo,
-    TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
+    TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem,
+    TopTenantShardsRequest, TopTenantShardsResponse,
 };
 use pageserver_api::shard::{ShardCount, TenantShardId};
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
@@ -54,6 +55,7 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use crate::config::PageServerConf;
+use crate::context;
 use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::pgdatadir_mapping::LsnForTimestamp;
@@ -63,6 +65,7 @@ use crate::tenant::mgr::{
    GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError,
    TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError,
 };
+use crate::tenant::remote_timeline_client::index::GcCompactionState;
 use crate::tenant::remote_timeline_client::{
    download_index_part, list_remote_tenant_shards, list_remote_timelines,
 };
@@ -481,6 +484,7 @@ async fn build_timeline_info_common(

        state,
        is_archived: Some(is_archived),
+        rel_size_migration: Some(timeline.get_rel_size_v2_status()),

        walreceiver_status,
    };
@@ -857,6 +861,75 @@ async fn timeline_archival_config_handler(
    json_response(StatusCode::OK, ())
 }

+/// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency
+/// measure only.
+///
+/// Some examples of safe patches:
+/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case of a bug that didn't bump the cutoff and causes read errors.
+/// - Force set the index part to use reldir v2 (migrating/migrated).
+///
+/// Some examples of unsafe patches:
+/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause
+///   errors.
+/// - Decrease the gc_cutoff without validating the data really exists. It will cause read errors in the background.
+async fn timeline_patch_index_part_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    // Authorize first, so the request body is only read and parsed for permitted callers.
+    check_permission(&request, None)?; // require global permission for this request
+    let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?;
+    let state = get_state(&request);
+
+    async {
+        let timeline =
+            active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+                .await?;
+
+        if let Some(rel_size_migration) = request_data.rel_size_migration {
+            timeline
+                .update_rel_size_v2_status(rel_size_migration)
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(gc_compaction_last_completed_lsn) =
+            request_data.gc_compaction_last_completed_lsn
+        {
+            timeline
+                .update_gc_compaction_state(GcCompactionState {
+                    last_completed_lsn: gc_compaction_last_completed_lsn,
+                })
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn {
+            // The guard leaves scope at the end of this `if` body; no extra block is needed.
+            let guard = timeline.applied_gc_cutoff_lsn.lock_for_write();
+            guard.store_and_unlock(applied_gc_cutoff_lsn);
+        }
+
+        if request_data.force_index_update {
+            timeline
+                .remote_client
+                .force_schedule_index_upload()
+                .context("force schedule index upload")
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        Ok::<_, ApiError>(())
+    }
+    .instrument(info_span!("timeline_patch_index_part",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                %timeline_id))
+    .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_detail_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -881,12 +954,13 @@ async fn timeline_detail_handler(
        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

        let timeline = tenant.get_timeline(timeline_id, false)?;
+        let ctx = &ctx.with_scope_timeline(&timeline);

        let timeline_info = build_timeline_info(
            &timeline,
            include_non_incremental_logical_size.unwrap_or(false),
            force_await_initial_logical_size.unwrap_or(false),
-            &ctx,
+            ctx,
        )
        .await
        .context("get local timeline info")
@@ -927,11 +1001,11 @@ async fn get_lsn_by_timestamp_handler(

    let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false);

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
    let result = timeline
        .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
        .await?;
@@ -1000,10 +1074,11 @@ async fn get_timestamp_of_lsn_handler(
        .with_context(|| format!("Invalid LSN: {lsn_str:?}"))
        .map_err(ApiError::BadRequest)?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
    let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;

    match result {
@@ -1358,7 +1433,8 @@ async fn timeline_layer_scan_disposable_keys(
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);

    let guard = timeline.layers.read().await;
    let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
@@ -1368,7 +1444,7 @@ async fn timeline_layer_scan_disposable_keys(
    };

    let resident_layer = layer
-        .download_and_keep_resident()
+        .download_and_keep_resident(&ctx)
        .await
        .map_err(|err| match err {
            tenant::storage_layer::layer::DownloadError::TimelineShutdown
@@ -1436,6 +1512,7 @@ async fn timeline_download_heatmap_layers_handler(

    let desired_concurrency =
        parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY);
+    let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false);

    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

@@ -1443,6 +1520,8 @@ async fn timeline_download_heatmap_layers_handler(
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);

    let max_concurrency = get_config(&request)
        .remote_storage_config
@@ -1451,7 +1530,7 @@ async fn timeline_download_heatmap_layers_handler(
        .unwrap_or(DEFAULT_MAX_CONCURRENCY);
    let concurrency = std::cmp::min(max_concurrency, desired_concurrency);

-    timeline.start_heatmap_layers_download(concurrency).await?;
+    timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?;

    json_response(StatusCode::ACCEPTED, ())
 }
@@ -1490,8 +1569,10 @@ async fn layer_download_handler(
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
    let downloaded = timeline
-        .download_layer(&layer_name)
+        .download_layer(&layer_name, &ctx)
        .await
        .map_err(|e| match e {
            tenant::storage_layer::layer::DownloadError::TimelineShutdown
@@ -2225,8 +2306,8 @@ async fn timeline_compact_handler(
        .unwrap_or(false);

    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
        if scheduled {
            let tenant = state
                .tenant_manager
@@ -2333,8 +2414,8 @@ async fn timeline_checkpoint_handler(
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
        if wait_until_flushed {
            timeline.freeze_and_flush().await
        } else {
@@ -2389,7 +2470,9 @@ async fn timeline_download_remote_layers_handler_post(
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
-    match timeline.spawn_download_all_remote_layers(body).await {
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
+    match timeline.spawn_download_all_remote_layers(body, &ctx).await {
        Ok(st) => json_response(StatusCode::ACCEPTED, st),
        Err(st) => json_response(StatusCode::CONFLICT, st),
    }
@@ -2471,6 +2554,7 @@ async fn timeline_detach_ancestor_handler(
        tracing::info!("all timeline upload queues are drained");

        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let ctx = &ctx.with_scope_timeline(&timeline);

        let progress = timeline
            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
@@ -2577,8 +2661,9 @@ async fn getpage_at_lsn_handler_inner(
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        // Enable read path debugging
-        let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build();
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
+        let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true)
+        .scope(context::Scope::new_timeline(&timeline)).build();

        // Use last_record_lsn if no lsn is provided
        let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
@@ -2612,8 +2697,8 @@ async fn timeline_collect_keyspace(
    let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;

    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
        let (dense_ks, sparse_ks) = timeline
            .collect_keyspace(at_lsn, &ctx)
@@ -3250,7 +3335,7 @@ async fn put_tenant_timeline_import_basebackup(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        let timeline = tenant
+        let (timeline, timeline_ctx) = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .map_err(ApiError::InternalServerError)
            .await?;
@@ -3269,7 +3354,13 @@ async fn put_tenant_timeline_import_basebackup(
        info!("importing basebackup");

        timeline
-            .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
+            .import_basebackup_from_tar(
+                tenant.clone(),
+                &mut body,
+                base_lsn,
+                broker_client,
+                &timeline_ctx,
+            )
            .await
            .map_err(ApiError::InternalServerError)?;

@@ -3309,6 +3400,7 @@ async fn put_tenant_timeline_import_wal(
        let state = get_state(&request);

        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
+        let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build();

        let mut body = StreamReader::new(request.into_body().map(|res| {
            res.map_err(|error| {
@@ -3625,6 +3717,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
            |r| api_handler(r, get_timestamp_of_lsn_handler),
        )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part",
+            |r| api_handler(r, timeline_patch_index_part_handler),
+        )
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
            |r| api_handler(r, lsn_lease_handler),
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -64,6 +64,7 @@ pub struct CancellableTask {
    pub cancel: CancellationToken,
 }
 pub struct HttpEndpointListener(pub CancellableTask);
+pub struct HttpsEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
 impl CancellableTask {
@@ -77,6 +78,7 @@ impl CancellableTask {
 #[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
    http_listener: HttpEndpointListener,
+    https_listener: Option<HttpsEndpointListener>,
    page_service: page_service::Listener,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
@@ -213,6 +215,15 @@ pub async fn shutdown_pageserver(
    )
    .await;

+    if let Some(https_listener) = https_listener {
+        timed(
+            https_listener.0.shutdown(),
+            "shutdown https",
+            Duration::from_secs(1),
+        )
+        .await;
+    }
+
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -143,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+/// Global histogram of layers visited per read batch; buckets are powers of two, 1 to 1024.
+pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    let buckets = vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0];
+    register_histogram!(
+        "pageserver_layers_per_read_batch_global",
+        "Layers visited to serve a single read batch (read amplification), regardless of number of reads.",
+        buckets,
+    )
+    .expect("failed to define a metric")
+});
+
+/// Global histogram of layers visited per read, amortized over the batch; buckets 1 to 1024.
+pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    let buckets = vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0];
+    register_histogram!(
+        "pageserver_layers_per_read_amortized_global",
+        "Layers visited to serve a single read (read amplification). Amortized across a batch: \
+            all visited layers are divided by number of reads.",
+        buckets,
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
    // We expect this to be low because of Postgres checkpoints. Let's see if that holds.
    register_histogram!(
@@ -1204,11 +1227,24 @@ impl StorageIoTime {

 pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);

-const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
+#[derive(Clone, Copy)]
+#[repr(usize)] // discriminants (Read = 0, Write = 1) are used as indices into `VARIANTS`
+enum StorageIoSizeOperation {
+    Read,
+    Write,
+}
+
+impl StorageIoSizeOperation {
+    const VARIANTS: &'static [&'static str] = &["read", "write"];
+
+    fn as_str(&self) -> &'static str {
+        Self::VARIANTS[*self as usize] // entry order in `VARIANTS` must match variant order
+    }
+}

 // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
-pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
+static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
        &["operation", "tenant_id", "shard_id", "timeline_id"]
@@ -1216,6 +1252,34 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+/// Handles to the `read` and `write` series of `STORAGE_IO_SIZE` for one set of label values.
+#[derive(Clone, Debug)]
+pub(crate) struct StorageIoSizeMetrics {
+    /// Gauge looked up with the `Read` operation label.
+    pub read: UIntGauge,
+    /// Gauge looked up with the `Write` operation label.
+    pub write: UIntGauge,
+}
+
+impl StorageIoSizeMetrics {
+    /// Looks up both gauges in `STORAGE_IO_SIZE` for the given label values.
+    ///
+    /// Panics if either lookup returns an error.
+    pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
+        let gauge_for = |operation: StorageIoSizeOperation| {
+            let labels = [operation.as_str(), tenant_id, shard_id, timeline_id];
+            STORAGE_IO_SIZE
+                .get_metric_with_label_values(&labels)
+                .unwrap()
+        };
+
+        Self {
+            read: gauge_for(StorageIoSizeOperation::Read),
+            write: gauge_for(StorageIoSizeOperation::Write),
+        }
+    }
+}
+
 #[cfg(not(test))]
 pub(crate) mod virtual_file_descriptor_cache {
    use super::*;
@@ -2798,6 +2862,7 @@ pub(crate) struct TimelineMetrics {
    /// Number of valid LSN leases.
    pub valid_lsn_lease_count_gauge: UIntGauge,
    pub wal_records_received: IntCounter,
+    pub storage_io_size: StorageIoSizeMetrics,
    shutdown: std::sync::atomic::AtomicBool,
 }

@@ -2933,6 +2998,8 @@ impl TimelineMetrics {
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

+        let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
+
        TimelineMetrics {
            tenant_id,
            shard_id,
@@ -2962,6 +3029,7 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
+            storage_io_size,
            valid_lsn_lease_count_gauge,
            wal_records_received,
            shutdown: std::sync::atomic::AtomicBool::default(),
@@ -3152,7 +3220,7 @@ impl TimelineMetrics {
            ]);
        }

-        for op in STORAGE_IO_SIZE_OPERATIONS {
+        for op in StorageIoSizeOperation::VARIANTS {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
        }

@@ -4074,6 +4142,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
    // histograms
    [
        &LAYERS_PER_READ_GLOBAL,
+        &LAYERS_PER_READ_BATCH_GLOBAL,
+        &LAYERS_PER_READ_AMORTIZED_GLOBAL,
        &DELTAS_PER_READ_GLOBAL,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -56,6 +56,7 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::{
    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
+    TimelineMetrics,
 };
 use crate::pgdatadir_mapping::Version;
 use crate::span::{
@@ -392,10 +393,6 @@ impl TimelineHandles {
            .await
            .map_err(|e| match e {
                timeline::handle::GetError::TenantManager(e) => e,
-                timeline::handle::GetError::TimelineGateClosed => {
-                    trace!("timeline gate closed");
-                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
-                }
                timeline::handle::GetError::PerTimelineStateShutDown => {
                    trace!("per-timeline state shut down");
                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
@@ -422,24 +419,36 @@ pub(crate) struct TenantManagerTypes;
 impl timeline::handle::Types for TenantManagerTypes {
    type TenantManagerError = GetActiveTimelineError;
    type TenantManager = TenantManagerWrapper;
-    type Timeline = Arc<Timeline>;
+    type Timeline = TenantManagerCacheItem;
 }

-impl timeline::handle::ArcTimeline<TenantManagerTypes> for Arc<Timeline> {
-    fn gate(&self) -> &utils::sync::gate::Gate {
-        &self.gate
-    }
+pub(crate) struct TenantManagerCacheItem {
+    pub(crate) timeline: Arc<Timeline>,
+    // Nested `Arc` on purpose (see allow): cheap propagation through RequestContext inside a task
+    #[allow(clippy::redundant_allocation)]
+    pub(crate) metrics: Arc<Arc<TimelineMetrics>>,
+    #[allow(dead_code)] // never read; held only to keep the gate open while this item lives
+    pub(crate) gate_guard: GateGuard,
+}

+impl std::ops::Deref for TenantManagerCacheItem {
+    type Target = Arc<Timeline>;
+    fn deref(&self) -> &Self::Target {
+        &self.timeline // method calls on the item are forwarded to the wrapped timeline
+    }
+}
+
+impl timeline::handle::Timeline<TenantManagerTypes> for TenantManagerCacheItem {
    fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId {
-        Timeline::shard_timeline_id(self)
+        Timeline::shard_timeline_id(&self.timeline)
    }

    fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState<TenantManagerTypes> {
-        &self.handles
+        &self.timeline.handles
    }

    fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity {
-        Timeline::get_shard_identity(self)
+        Timeline::get_shard_identity(&self.timeline)
    }
 }

@@ -448,7 +457,7 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
        &self,
        timeline_id: TimelineId,
        shard_selector: ShardSelector,
-    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+    ) -> Result<TenantManagerCacheItem, GetActiveTimelineError> {
        let tenant_id = self.tenant_id.get().expect("we set this in get()");
        let timeout = ACTIVE_TENANT_TIMEOUT;
        let wait_start = Instant::now();
@@ -491,7 +500,23 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
        let timeline = tenant_shard
            .get_timeline(timeline_id, true)
            .map_err(GetActiveTimelineError::Timeline)?;
-        Ok(timeline)
+
+        let gate_guard = match timeline.gate.enter() {
+            Ok(guard) => guard,
+            Err(_) => {
+                return Err(GetActiveTimelineError::Timeline(
+                    GetTimelineError::ShuttingDown,
+                ));
+            }
+        };
+
+        let metrics = Arc::new(Arc::clone(&timeline.metrics));
+
+        Ok(TenantManagerCacheItem {
+            timeline,
+            metrics,
+            gate_guard,
+        })
    }
 }

@@ -1220,6 +1245,14 @@ impl PageServerHandler {
        ),
        QueryError,
    > {
+        macro_rules! upgrade_handle_and_set_context {
+            ($shard:ident) => {{
+                let weak_handle = &$shard;
+                let handle = weak_handle.upgrade()?; // `?` early-returns from the enclosing fn
+                let ctx = ctx.with_scope_page_service_pagestream(&handle); // scoped to this handle
+                (handle, ctx) // callers rebind both, shadowing the outer `shard` and `ctx`
+            }};
+        }
        Ok(match batch {
            BatchedFeMessage::Exists {
                span,
@@ -1228,9 +1261,10 @@ impl PageServerHandler {
                req,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::exists");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    vec![
-                        self.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx)
+                        self.handle_get_rel_exists_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
                            .map(|msg| (msg, timer))
@@ -1246,9 +1280,10 @@ impl PageServerHandler {
                req,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::nblocks");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    vec![
-                        self.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx)
+                        self.handle_get_nblocks_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
                            .map(|msg| (msg, timer))
@@ -1264,17 +1299,18 @@ impl PageServerHandler {
                pages,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::getpage");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    {
                        let npages = pages.len();
                        trace!(npages, "handling getpage request");
                        let res = self
                            .handle_get_page_at_lsn_request_batched(
-                                &*shard.upgrade()?,
+                                &shard,
                                effective_request_lsn,
                                pages,
                                io_concurrency,
-                                ctx,
+                                &ctx,
                            )
                            .instrument(span.clone())
                            .await;
@@ -1291,9 +1327,10 @@ impl PageServerHandler {
                req,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::dbsize");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    vec![
-                        self.handle_db_size_request(&*shard.upgrade()?, &req, ctx)
+                        self.handle_db_size_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
                            .map(|msg| (msg, timer))
@@ -1309,9 +1346,10 @@ impl PageServerHandler {
                req,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    vec![
-                        self.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx)
+                        self.handle_get_slru_segment_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
                            .map(|msg| (msg, timer))
@@ -1327,12 +1365,13 @@ impl PageServerHandler {
                requests,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::test");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    {
                        let npages = requests.len();
                        trace!(npages, "handling getpage request");
                        let res = self
-                            .handle_test_request_batch(&*shard.upgrade()?, requests, ctx)
+                            .handle_test_request_batch(&shard, requests, &ctx)
                            .instrument(span.clone())
                            .await;
                        assert_eq!(res.len(), npages);
@@ -2095,6 +2134,7 @@ impl PageServerHandler {
                // TODO: passthrough the error site to the final error message?
                BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
                BasebackupError::Server(e) => QueryError::Other(e),
+                BasebackupError::Shutdown => QueryError::Shutdown,
            }
        }

@@ -2107,6 +2147,7 @@ impl PageServerHandler {
            .get(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
        set_tracing_field_shard_id(&timeline);
+        let ctx = ctx.with_scope_timeline(&timeline);

        if timeline.is_archived() == Some(true) {
            tracing::info!(
@@ -2124,7 +2165,7 @@ impl PageServerHandler {
                    lsn,
                    crate::tenant::timeline::WaitLsnWaiter::PageService,
                    crate::tenant::timeline::WaitLsnTimeout::Default,
-                    ctx,
+                    &ctx,
                )
                .await?;
            timeline
@@ -2150,7 +2191,7 @@ impl PageServerHandler {
                prev_lsn,
                full_backup,
                replica,
-                ctx,
+                &ctx,
            )
            .await
            .map_err(map_basebackup_error)?;
@@ -2173,7 +2214,7 @@ impl PageServerHandler {
                    prev_lsn,
                    full_backup,
                    replica,
-                    ctx,
+                    &ctx,
                )
                .await
                .map_err(map_basebackup_error)?;
@@ -2190,7 +2231,7 @@ impl PageServerHandler {
                    prev_lsn,
                    full_backup,
                    replica,
-                    ctx,
+                    &ctx,
                )
                .await
                .map_err(map_basebackup_error)?;
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -21,6 +21,7 @@ use pageserver_api::key::{
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
 };
 use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::models::RelSizeMigration;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
@@ -492,7 +493,9 @@ impl Timeline {
        // Otherwise, read the old reldir keyspace.
        // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.

-        if self.get_rel_size_v2_enabled() {
+        if let RelSizeMigration::Migrated | RelSizeMigration::Migrating =
+            self.get_rel_size_v2_status()
+        {
            // fetch directory listing (new)
            let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
            let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
@@ -544,7 +547,7 @@ impl Timeline {
                forknum: *forknum,
            }));

-        if !self.get_rel_size_v2_enabled() {
+        if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() {
            return Ok(rels_v1);
        }

@@ -599,28 +602,36 @@ impl Timeline {
        let n_blocks = self
            .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
            .await?;
-        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
-        for blkno in 0..n_blocks {
-            let block = self
-                .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
-                .await?;
-            segment.extend_from_slice(&block[..BLCKSZ as usize]);
-        }
-        Ok(segment.freeze())
-    }

-    /// Look up given SLRU page version.
-    pub(crate) async fn get_slru_page_at_lsn(
-        &self,
-        kind: SlruKind,
-        segno: u32,
-        blknum: BlockNumber,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<Bytes, PageReconstructError> {
-        assert!(self.tenant_shard_id.is_shard_zero());
-        let key = slru_block_to_key(kind, segno, blknum);
-        self.get(key, lsn, ctx).await
+        let keyspace = KeySpace::single(
+            slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks),
+        );
+
+        let batches = keyspace.partition(
+            self.get_shard_identity(),
+            Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+        );
+
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            self.conf,
+            self.gate
+                .enter()
+                .map_err(|_| PageReconstructError::Cancelled)?,
+        );
+
+        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
+        for batch in batches.parts {
+            let blocks = self
+                .get_vectored(batch, lsn, io_concurrency.clone(), ctx)
+                .await?;
+
+            for (_key, block) in blocks {
+                let block = block?;
+                segment.extend_from_slice(&block[..BLCKSZ as usize]);
+            }
+        }
+
+        Ok(segment.freeze())
    }

    /// Get size of an SLRU segment
@@ -829,19 +840,41 @@ impl Timeline {
            let nblocks = self
                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
                .await?;
-            for blknum in (0..nblocks).rev() {
-                let clog_page = self
-                    .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
+
+            let keyspace = KeySpace::single(
+                slru_block_to_key(SlruKind::Clog, segno, 0)
+                    ..slru_block_to_key(SlruKind::Clog, segno, nblocks),
+            );
+
+            let batches = keyspace.partition(
+                self.get_shard_identity(),
+                Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+            );
+
+            let io_concurrency = IoConcurrency::spawn_from_conf(
+                self.conf,
+                self.gate
+                    .enter()
+                    .map_err(|_| PageReconstructError::Cancelled)?,
+            );
+
+            for batch in batches.parts.into_iter().rev() {
+                let blocks = self
+                    .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
                    .await?;

-                if clog_page.len() == BLCKSZ as usize + 8 {
-                    let mut timestamp_bytes = [0u8; 8];
-                    timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
-                    let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
+                for (_key, clog_page) in blocks.into_iter().rev() {
+                    let clog_page = clog_page?;

-                    match f(timestamp) {
-                        ControlFlow::Break(b) => return Ok(b),
-                        ControlFlow::Continue(()) => (),
+                    if clog_page.len() == BLCKSZ as usize + 8 {
+                        let mut timestamp_bytes = [0u8; 8];
+                        timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
+                        let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
+
+                        match f(timestamp) {
+                            ControlFlow::Break(b) => return Ok(b),
+                            ControlFlow::Continue(()) => (),
+                        }
                    }
                }
            }
@@ -1052,6 +1085,8 @@ impl Timeline {
    ) -> Result<u64, CalculateLogicalSizeError> {
        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();

+        fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) });
+
        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
        let dbdir = DbDirectory::des(&buf)?;
@@ -1718,6 +1753,35 @@ impl DatadirModification<'_> {
        Ok(())
    }

+    /// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that
+    /// we enable it, we also need to persist it in `index_part.json`.
+    pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result<bool> {
+        let status = self.tline.get_rel_size_v2_status();
+        let config = self.tline.get_rel_size_v2_enabled();
+        match (config, status) {
+            (false, RelSizeMigration::Legacy) => {
+                // tenant config didn't enable it and we didn't write any reldir_v2 key yet
+                Ok(false)
+            }
+            (false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
+                // index_part already persisted that the timeline has enabled rel_size_v2
+                Ok(true)
+            }
+            (true, RelSizeMigration::Legacy) => {
+                // The first time we enable it, we need to persist it in `index_part.json`
+                self.tline
+                    .update_rel_size_v2_status(RelSizeMigration::Migrating)?;
+                tracing::info!("enabled rel_size_v2");
+                Ok(true)
+            }
+            (true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
+                // index_part already persisted that the timeline has enabled rel_size_v2
+                // and we don't need to do anything
+                Ok(true)
+            }
+        }
+    }
+
    /// Store a relmapper file (pg_filenode.map) in the repository
    pub async fn put_relmap_file(
        &mut self,
@@ -1726,6 +1790,8 @@ impl DatadirModification<'_> {
        img: Bytes,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+
        // Add it to the directory (if it doesn't exist already)
        let buf = self.get(DBDIR_KEY, ctx).await?;
        let mut dbdir = DbDirectory::des(&buf)?;
@@ -1746,7 +1812,7 @@ impl DatadirModification<'_> {
            })?;
            self.pending_directory_entries
                .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
-            if self.tline.get_rel_size_v2_enabled() {
+            if v2_enabled {
                self.pending_directory_entries
                    .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
            }
@@ -1898,12 +1964,12 @@ impl DatadirModification<'_> {
                .context("deserialize db")?
        };

-        // Add the new relation to the rel directory entry, and write it back
-        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-            return Err(RelationError::AlreadyExists);
-        }
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;

-        if self.tline.get_rel_size_v2_enabled() {
+        if v2_enabled {
+            if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
+                return Err(RelationError::AlreadyExists);
+            }
            let sparse_rel_dir_key =
                rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
            // check if the rel_dir_key exists in v2
@@ -1938,6 +2004,10 @@ impl DatadirModification<'_> {
            self.pending_directory_entries
                .push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
        } else {
+            // Add the new relation to the rel directory entry, and write it back
+            if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
+                return Err(RelationError::AlreadyExists);
+            }
            if !dbdir_exists {
                self.pending_directory_entries
                    .push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
@@ -1951,6 +2021,7 @@ impl DatadirModification<'_> {
                )),
            );
        }
+
        // Put size
        let size_key = rel_size_to_key(rel);
        let buf = nblocks.to_le_bytes();
@@ -2029,6 +2100,7 @@ impl DatadirModification<'_> {
        drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;
        for ((spc_node, db_node), rel_tags) in drop_relations {
            let dir_key = rel_dir_to_key(spc_node, db_node);
            let buf = self.get(dir_key, ctx).await?;
@@ -2041,7 +2113,7 @@ impl DatadirModification<'_> {
                        .push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
                    dirty = true;
                    true
-                } else if self.tline.get_rel_size_v2_enabled() {
+                } else if v2_enabled {
                    // The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
                    // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
                    // logic).
@@ -2072,7 +2144,7 @@ impl DatadirModification<'_> {
                    // Remove entry from relation size cache
                    self.tline.remove_cached_rel_size(&rel_tag);

-                    // Delete size entry, as well as all blocks
+                    // Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage.
                    self.delete(rel_key_range(rel_tag));
                }
            }
@@ -2686,7 +2758,7 @@ mod tests {
            TimelineId::from_array(hex!("11223344556677881122334455667788"));

        let (tenant, ctx) = harness.load().await;
-        let tline = tenant
+        let (tline, ctx) = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let tline = tline.raw_timeline().unwrap();
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -31,8 +31,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use itertools::Itertools as _;
 use once_cell::sync::Lazy;
-use pageserver_api::models;
 pub use pageserver_api::models::TenantState;
+use pageserver_api::models::{self, RelSizeMigration};
 use pageserver_api::models::{
    CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
    WalRedoManagerStatus,
@@ -77,6 +77,8 @@ use self::timeline::{
    EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
 };
 use crate::config::PageServerConf;
+use crate::context;
+use crate::context::RequestContextBuilder;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::l0_flush::L0FlushGlobalState;
@@ -1114,7 +1116,7 @@ impl Tenant {
            }
        };

-        let timeline = self.create_timeline_struct(
+        let (timeline, timeline_ctx) = self.create_timeline_struct(
            timeline_id,
            &metadata,
            previous_heatmap,
@@ -1123,6 +1125,8 @@ impl Tenant {
            CreateTimelineCause::Load,
            idempotency.clone(),
            index_part.gc_compaction.clone(),
+            index_part.rel_size_migration.clone(),
+            ctx,
        )?;
        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
        anyhow::ensure!(
@@ -1149,16 +1153,19 @@ impl Tenant {
        // a previous heatmap which contains all visible layers in the layer map.
        // This previous heatmap will be used whenever a fresh heatmap is generated
        // for the timeline.
-        if matches!(cause, LoadTimelineCause::Unoffload) {
+        if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) {
            let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn()));
            while let Some((tline, end_lsn)) = tline_ending_at {
                let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await;
-                if !tline.is_previous_heatmap_active() {
+                // Another unarchived timeline might have generated a heatmap for this ancestor.
+                // If the current branch point is greater than the previous one, use the heatmap
+                // we just generated - it should include more layers.
+                if !tline.should_keep_previous_heatmap(end_lsn) {
                    tline
                        .previous_heatmap
                        .store(Some(Arc::new(unarchival_heatmap)));
                } else {
-                    tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.")
+                    tracing::info!("Previous heatmap preferred. Dropping unarchival heatmap.")
                }

                match tline.ancestor_timeline() {
@@ -1253,7 +1260,7 @@ impl Tenant {
                        match activate {
                            ActivateTimelineArgs::Yes { broker_client } => {
                                info!("activating timeline after reload from pgdata import task");
-                                timeline.activate(self.clone(), broker_client, None, ctx);
+                                timeline.activate(self.clone(), broker_client, None, &timeline_ctx);
                            }
                            ActivateTimelineArgs::No => (),
                        }
@@ -1578,6 +1585,10 @@ impl Tenant {
    }

    async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
+        if !self.conf.load_previous_heatmap {
+            return None;
+        }
+
        let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
        match tokio::fs::read_to_string(on_disk_heatmap_path).await {
            Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
@@ -1757,6 +1768,7 @@ impl Tenant {
                        import_pgdata,
                        ActivateTimelineArgs::No,
                        guard,
+                        ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
                    ));
                }
            }
@@ -1774,6 +1786,7 @@ impl Tenant {
                timeline_id,
                &index_part.metadata,
                remote_timeline_client,
+                ctx,
            )
            .instrument(tracing::info_span!("timeline_delete", %timeline_id))
            .await
@@ -1939,6 +1952,7 @@ impl Tenant {
                hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
                    heatmap: h,
                    read_at: hs.1,
+                    end_lsn: None,
                })
            });
            part_downloads.spawn(
@@ -2210,7 +2224,7 @@ impl Tenant {
                self.clone(),
                broker_client.clone(),
                background_jobs_can_start,
-                &ctx,
+                &ctx.with_scope_timeline(&timeline),
            );
        }

@@ -2407,8 +2421,8 @@ impl Tenant {
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
        pg_version: u32,
-        _ctx: &RequestContext,
-    ) -> anyhow::Result<UninitializedTimeline> {
+        ctx: &RequestContext,
+    ) -> anyhow::Result<(UninitializedTimeline, RequestContext)> {
        anyhow::ensure!(
            self.is_active(),
            "Cannot create empty timelines on inactive tenant"
@@ -2442,6 +2456,8 @@ impl Tenant {
            create_guard,
            initdb_lsn,
            None,
+            None,
+            ctx,
        )
        .await
    }
@@ -2459,7 +2475,7 @@ impl Tenant {
        pg_version: u32,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let uninit_tl = self
+        let (uninit_tl, ctx) = self
            .create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
            .await?;
        let tline = uninit_tl.raw_timeline().expect("we just created it");
@@ -2471,7 +2487,7 @@ impl Tenant {
            .init_empty_test_timeline()
            .context("init_empty_test_timeline")?;
        modification
-            .commit(ctx)
+            .commit(&ctx)
            .await
            .context("commit init_empty_test_timeline modification")?;

@@ -2497,6 +2513,7 @@ impl Tenant {
        initdb_lsn: Lsn,
        pg_version: u32,
        ctx: &RequestContext,
+        in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
        end_lsn: Lsn,
@@ -2518,6 +2535,11 @@ impl Tenant {
                .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
                .await?;
        }
+        for in_memory in in_memory_layer_desc {
+            tline
+                .force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx)
+                .await?;
+        }
        let layer_names = tline
            .layers
            .read()
@@ -2683,7 +2705,12 @@ impl Tenant {
        // doing stuff before the IndexPart is durable in S3, which is done by the previous section.
        let activated_timeline = match result {
            CreateTimelineResult::Created(timeline) => {
-                timeline.activate(self.clone(), broker_client, None, ctx);
+                timeline.activate(
+                    self.clone(),
+                    broker_client,
+                    None,
+                    &ctx.with_scope_timeline(&timeline),
+                );
                timeline
            }
            CreateTimelineResult::Idempotent(timeline) => {
@@ -2745,10 +2772,9 @@ impl Tenant {
            }
        };

-        let mut uninit_timeline = {
+        let (mut uninit_timeline, timeline_ctx) = {
            let this = &self;
            let initdb_lsn = Lsn(0);
-            let _ctx = ctx;
            async move {
                let new_metadata = TimelineMetadata::new(
                    // Initialize disk_consistent LSN to 0, The caller must import some data to
@@ -2767,6 +2793,8 @@ impl Tenant {
                    timeline_create_guard,
                    initdb_lsn,
                    None,
+                    None,
+                    ctx,
                )
                .await
            }
@@ -2796,6 +2824,7 @@ impl Tenant {
            index_part,
            activate,
            timeline_create_guard,
+            timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
        ));

        // NB: the timeline doesn't exist in self.timelines at this point
@@ -2809,6 +2838,7 @@ impl Tenant {
        index_part: import_pgdata::index_part_format::Root,
        activate: ActivateTimelineArgs,
        timeline_create_guard: TimelineCreateGuard,
+        ctx: RequestContext,
    ) {
        debug_assert_current_span_has_tenant_and_timeline_id();
        info!("starting");
@@ -2820,6 +2850,7 @@ impl Tenant {
                index_part,
                activate,
                timeline_create_guard,
+                ctx,
            )
            .await;
        if let Err(err) = &res {
@@ -2835,9 +2866,8 @@ impl Tenant {
        index_part: import_pgdata::index_part_format::Root,
        activate: ActivateTimelineArgs,
        timeline_create_guard: TimelineCreateGuard,
+        ctx: RequestContext,
    ) -> Result<(), anyhow::Error> {
-        let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn);
-
        info!("importing pgdata");
        import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
            .await
@@ -3046,6 +3076,7 @@ impl Tenant {

            let mut has_pending_l0 = false;
            for timeline in compact_l0 {
+                let ctx = &ctx.with_scope_timeline(&timeline);
                let outcome = timeline
                    .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
                    .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
@@ -3079,6 +3110,7 @@ impl Tenant {
            if !timeline.is_active() {
                continue;
            }
+            let ctx = &ctx.with_scope_timeline(&timeline);

            let mut outcome = timeline
                .compact(cancel, EnumSet::default(), ctx)
@@ -3141,11 +3173,13 @@ impl Tenant {
    /// Trips the compaction circuit breaker if appropriate.
    pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
        match err {
+            err if err.is_cancel() => {}
            CompactionError::ShuttingDown => (),
            // Offload failures don't trip the circuit breaker, since they're cheap to retry and
            // shouldn't block compaction.
            CompactionError::Offload(_) => {}
            CompactionError::CollectKeySpaceError(err) => {
+                // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch.
                self.compaction_circuit_breaker
                    .lock()
                    .unwrap()
@@ -3302,7 +3336,7 @@ impl Tenant {
                    self.clone(),
                    broker_client.clone(),
                    background_jobs_can_start,
-                    ctx,
+                    &ctx.with_scope_timeline(timeline),
                );
                activated_timelines += 1;
            }
@@ -4116,7 +4150,9 @@ impl Tenant {
        cause: CreateTimelineCause,
        create_idempotency: CreateTimelineIdempotency,
        gc_compaction_state: Option<GcCompactionState>,
-    ) -> anyhow::Result<Arc<Timeline>> {
+        rel_size_v2_status: Option<RelSizeMigration>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<(Arc<Timeline>, RequestContext)> {
        let state = match cause {
            CreateTimelineCause::Load => {
                let ancestor_id = new_metadata.ancestor_timeline();
@@ -4148,10 +4184,15 @@ impl Tenant {
            self.attach_wal_lag_cooldown.clone(),
            create_idempotency,
            gc_compaction_state,
+            rel_size_v2_status,
            self.cancel.child_token(),
        );

-        Ok(timeline)
+        let timeline_ctx = RequestContextBuilder::extend(ctx)
+            .scope(context::Scope::new_timeline(&timeline))
+            .build();
+
+        Ok((timeline, timeline_ctx))
    }

    /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
@@ -4567,6 +4608,7 @@ impl Tenant {
        // Ensures all timelines use the same start time when computing the time cutoff.
        let now_ts_for_pitr_calc = SystemTime::now();
        for timeline in timelines.iter() {
+            let ctx = &ctx.with_scope_timeline(timeline);
            let cutoff = timeline
                .get_last_record_lsn()
                .checked_sub(horizon)
@@ -4740,7 +4782,7 @@ impl Tenant {
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
        start_lsn: Option<Lsn>,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> Result<CreateTimelineResult, CreateTimelineError> {
        let src_id = src_timeline.timeline_id;

@@ -4843,13 +4885,15 @@ impl Tenant {
            src_timeline.pg_version,
        );

-        let uninitialized_timeline = self
+        let (uninitialized_timeline, _timeline_ctx) = self
            .prepare_new_timeline(
                dst_id,
                &metadata,
                timeline_create_guard,
                start_lsn + 1,
                Some(Arc::clone(src_timeline)),
+                Some(src_timeline.get_rel_size_v2_status()),
+                ctx,
            )
            .await?;

@@ -5116,13 +5160,15 @@ impl Tenant {
            pgdata_lsn,
            pg_version,
        );
-        let mut raw_timeline = self
+        let (mut raw_timeline, timeline_ctx) = self
            .prepare_new_timeline(
                timeline_id,
                &new_metadata,
                timeline_create_guard,
                pgdata_lsn,
                None,
+                None,
+                ctx,
            )
            .await?;

@@ -5133,7 +5179,7 @@ impl Tenant {
                    &unfinished_timeline,
                    &pgdata_path,
                    pgdata_lsn,
-                    ctx,
+                    &timeline_ctx,
                )
                .await
                .with_context(|| {
@@ -5194,6 +5240,7 @@ impl Tenant {
    /// An empty layer map is initialized, and new data and WAL can be imported starting
    /// at 'disk_consistent_lsn'. After any initial data has been imported, call
    /// `finish_creation` to insert the Timeline into the timelines map.
+    #[allow(clippy::too_many_arguments)]
    async fn prepare_new_timeline<'a>(
        &'a self,
        new_timeline_id: TimelineId,
@@ -5201,15 +5248,17 @@ impl Tenant {
        create_guard: TimelineCreateGuard,
        start_lsn: Lsn,
        ancestor: Option<Arc<Timeline>>,
-    ) -> anyhow::Result<UninitializedTimeline<'a>> {
+        rel_size_v2_status: Option<RelSizeMigration>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<(UninitializedTimeline<'a>, RequestContext)> {
        let tenant_shard_id = self.tenant_shard_id;

        let resources = self.build_timeline_resources(new_timeline_id);
        resources
            .remote_client
-            .init_upload_queue_for_empty_remote(new_metadata)?;
+            .init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?;

-        let timeline_struct = self
+        let (timeline_struct, timeline_ctx) = self
            .create_timeline_struct(
                new_timeline_id,
                new_metadata,
@@ -5219,6 +5268,8 @@ impl Tenant {
                CreateTimelineCause::Load,
                create_guard.idempotency.clone(),
                None,
+                rel_size_v2_status,
+                ctx,
            )
            .context("Failed to create timeline data structure")?;

@@ -5239,10 +5290,13 @@ impl Tenant {
            "Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}"
        );

-        Ok(UninitializedTimeline::new(
-            self,
-            new_timeline_id,
-            Some((timeline_struct, create_guard)),
+        Ok((
+            UninitializedTimeline::new(
+                self,
+                new_timeline_id,
+                Some((timeline_struct, create_guard)),
+            ),
+            timeline_ctx,
        ))
    }

@@ -5777,7 +5831,8 @@ pub(crate) mod harness {
        }

        pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
-            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
+                .with_scope_unit_test();
            (
                self.do_try_load(&ctx)
                    .await
@@ -5907,6 +5962,8 @@ mod tests {
    #[cfg(feature = "testing")]
    use timeline::GcInfo;
    #[cfg(feature = "testing")]
+    use timeline::InMemoryLayerTestDesc;
+    #[cfg(feature = "testing")]
    use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
    use timeline::{CompactOptions, DeltaLayerTestDesc};
    use utils::id::TenantId;
@@ -6798,7 +6855,7 @@ mod tests {

        let (tenant, ctx) = harness.load().await;
        let io_concurrency = IoConcurrency::spawn_for_test();
-        let tline = tenant
+        let (tline, ctx) = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let tline = tline.raw_timeline().unwrap();
@@ -7420,7 +7477,7 @@ mod tests {
            .await;

        let initdb_lsn = Lsn(0x20);
-        let utline = tenant
+        let (utline, ctx) = tenant
            .create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
            .await?;
        let tline = utline.raw_timeline().unwrap();
@@ -7487,7 +7544,7 @@ mod tests {
        let harness = TenantHarness::create(name).await?;
        {
            let (tenant, ctx) = harness.load().await;
-            let tline = tenant
+            let (tline, _ctx) = tenant
                .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
                .await?;
            // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
@@ -7919,6 +7976,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                Vec::new(), // delta layers
                vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
@@ -8006,6 +8064,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                Vec::new(), // delta layers
                vec![(
                    Lsn(0x20),
@@ -8221,6 +8280,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                // delta layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8301,6 +8361,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                // delta layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8374,6 +8435,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                // delta layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8506,6 +8568,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -8699,6 +8762,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
                    Lsn(0x10)..Lsn(0x40),
                    delta1,
@@ -8755,6 +8819,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                Vec::new(),
                image_layers,
                end_lsn,
@@ -8961,6 +9026,7 @@ mod tests {
                    Lsn(0x08),
                    DEFAULT_PG_VERSION,
                    &ctx,
+                    Vec::new(), // in-memory layers
                    vec![
                        DeltaLayerTestDesc::new_with_inferred_key_range(
                            Lsn(0x08)..Lsn(0x10),
@@ -8979,7 +9045,7 @@ mod tests {
                            delta3,
                        ),
                    ], // delta layers
-                    vec![], // image layers
+                    vec![],     // image layers
                    Lsn(0x50),
                )
                .await?
@@ -8990,6 +9056,7 @@ mod tests {
                    Lsn(0x10),
                    DEFAULT_PG_VERSION,
                    &ctx,
+                    Vec::new(), // in-memory layers
                    vec![
                        DeltaLayerTestDesc::new_with_inferred_key_range(
                            Lsn(0x10)..Lsn(0x48),
@@ -9540,6 +9607,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
@@ -9787,6 +9855,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![
                    // delta1 and delta 2 only contain a single key but multiple updates
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
@@ -10022,6 +10091,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![],                       // in-memory layers
                vec![],                       // delta layers
                vec![(Lsn(0x18), img_layer)], // image layers
                Lsn(0x18),
@@ -10268,6 +10338,7 @@ mod tests {
                baseline_image_layer_lsn,
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
                    delta_layer_start_lsn..delta_layer_end_lsn,
                    delta_layer_spec,
@@ -10299,6 +10370,158 @@ mod tests {
        Ok(())
    }

+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> {
+        let harness =
+            TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?;
+        let (tenant, ctx) = harness.load().await;
+        // Key ids that receive a `wal_init` record (not an append) in the frozen layer below.
+        let will_init_keys = [2, 6];
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let mut expected_key_values = HashMap::new();
+        // Baseline image layer: keys 0..5, each with an image at LSN 0x10.
+        let baseline_image_layer_lsn = Lsn(0x10);
+        let mut baseline_img_layer = Vec::new();
+        for i in 0..5 {
+            let key = get_key(i);
+            let value = format!("value {i}@{baseline_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            baseline_img_layer.push((key, Bytes::from(value)));
+        }
+        // Nested image layer: keys 5..10 at LSN 0x50, i.e. inside the frozen layer's LSN range.
+        let nested_image_layer_lsn = Lsn(0x50);
+        let mut nested_img_layer = Vec::new();
+        for i in 5..10 {
+            let key = get_key(i);
+            let value = format!("value {i}@{nested_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            nested_img_layer.push((key, Bytes::from(value)));
+        }
+        // Frozen layer: one record per key, above the nested image where the key has one.
+        let frozen_layer = {
+            let lsn_range = Lsn(0x40)..Lsn(0x60);
+            let mut data = Vec::new();
+            for i in 0..10 {
+                let key = get_key(i);
+                let key_in_nested = nested_img_layer
+                    .iter()
+                    .any(|(key_with_img, _)| *key_with_img == key);
+                let lsn = {
+                    if key_in_nested {
+                        Lsn(nested_image_layer_lsn.0 + 5)
+                    } else {
+                        lsn_range.start
+                    }
+                };
+
+                let will_init = will_init_keys.contains(&i);
+                if will_init {
+                    data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
+
+                    expected_key_values.insert(key, "".to_string());
+                } else {
+                    let delta = format!("@{lsn}");
+                    data.push((
+                        key,
+                        lsn,
+                        Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                    ));
+
+                    expected_key_values
+                        .get_mut(&key)
+                        .expect("An image exists for each key")
+                        .push_str(delta.as_str());
+                }
+            }
+
+            InMemoryLayerTestDesc {
+                lsn_range,
+                is_open: false,
+                data,
+            }
+        };
+        // Open layer: one append per key; the highest LSN written becomes the last record LSN.
+        let (open_layer, last_record_lsn) = {
+            let start_lsn = Lsn(0x70);
+            let mut data = Vec::new();
+            let mut end_lsn = Lsn(0);
+            for i in 0..10 {
+                let key = get_key(i);
+                let lsn = Lsn(start_lsn.0 + i as u64);
+                let delta = format!("@{lsn}");
+                data.push((
+                    key,
+                    lsn,
+                    Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                ));
+
+                expected_key_values
+                    .get_mut(&key)
+                    .expect("An image exists for each key")
+                    .push_str(delta.as_str());
+
+                end_lsn = std::cmp::max(end_lsn, lsn);
+            }
+
+            (
+                InMemoryLayerTestDesc {
+                    lsn_range: start_lsn..Lsn::MAX,
+                    is_open: true,
+                    data,
+                },
+                end_lsn,
+            )
+        };
+        // Precondition of the scenario: the nested image lies strictly inside the frozen layer.
+        assert!(
+            nested_image_layer_lsn > frozen_layer.lsn_range.start
+                && nested_image_layer_lsn < frozen_layer.lsn_range.end
+        );
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                baseline_image_layer_lsn,
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![open_layer, frozen_layer], // in-memory layers
+                Vec::new(),                     // delta layers
+                vec![
+                    (baseline_image_layer_lsn, baseline_img_layer),
+                    (nested_image_layer_lsn, nested_img_layer),
+                ], // image layers
+                last_record_lsn,
+            )
+            .await?;
+
+        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let results = tline
+            .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
+            .await
+            .expect("No vectored errors");
+        for (key, res) in results {
+            let value = res.expect("No key errors");
+            let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
+            assert_eq!(value, Bytes::from(expected_value.clone()));
+
+            tracing::info!("key={key} value={expected_value}");
+        }
+        assert!(expected_key_values.is_empty(), "every expected key must be returned");
+        Ok(())
+    }
+
    fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
        (
            k1.is_delta,
@@ -10414,6 +10637,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -10798,6 +11022,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![
                    // delta1/2/4 only contain a single key but multiple updates
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
@@ -11049,6 +11274,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![
                    // delta1/2/4 only contain a single key but multiple updates
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -382,7 +382,8 @@ pub(crate) mod tests {
    }

    async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let (_temp_dir, pathbuf, offsets) =
            write_maybe_compressed(blobs, compression, &ctx).await?;

--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -32,8 +32,7 @@ use hex;
 use thiserror::Error;
 use tracing::error;

-use crate::context::{DownloadBehavior, RequestContext};
-use crate::task_mgr::TaskKind;
+use crate::context::RequestContext;
 use crate::tenant::block_io::{BlockReader, BlockWriter};
 use crate::virtual_file::{IoBuffer, IoBufferMut, owned_buffers_io::write::Buffer};

@@ -478,16 +477,15 @@ where
    }

    #[allow(dead_code)]
-    pub async fn dump(&self) -> Result<()> {
+    pub async fn dump(&self, ctx: &RequestContext) -> Result<()> {
        let mut stack = Vec::new();
-        let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

        stack.push((self.root_blk, String::new(), 0, 0, 0));

        let block_cursor = self.reader.block_cursor();

        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?;
+            let blk = block_cursor.read_blk(self.start_blk + blknum, ctx).await?;
            let buf: &[u8] = blk.as_ref();
            let node = OnDiskNode::<L>::deparse(buf)?;

@@ -836,6 +834,8 @@ pub(crate) mod tests {
    use rand::Rng;

    use super::*;
+    use crate::context::DownloadBehavior;
+    use crate::task_mgr::TaskKind;
    use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};

    #[derive(Clone, Default)]
@@ -870,7 +870,8 @@ pub(crate) mod tests {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();

        let all_keys: Vec<&[u8; 6]> = vec![
            b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb",
@@ -888,7 +889,7 @@ pub(crate) mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump(&ctx).await?;

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
@@ -980,7 +981,8 @@ pub(crate) mod tests {
    async fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();

        const NUM_KEYS: u64 = 1000;

@@ -998,7 +1000,7 @@ pub(crate) mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump(&ctx).await?;

        use std::sync::Mutex;

@@ -1168,7 +1170,8 @@ pub(crate) mod tests {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();

        for (key, val) in disk_btree_test_data::TEST_DATA {
            writer.append(&key, val)?;
@@ -1199,7 +1202,7 @@ pub(crate) mod tests {
            .await?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

-        reader.dump().await?;
+        reader.dump(&ctx).await?;

        Ok(())
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -352,7 +352,8 @@ mod tests {
        let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
        fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;

-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();

        Ok((conf, tenant_shard_id, timeline_id, ctx))
    }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage<Value> {
    /// The latest state
    head: LayerCoverageTuple<Value>,

+    // TODO: this could be an ordered vec using binary search.
+    // We push into this map every time we add a layer, so we might see some benefit.
    /// All previous states
    historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
 }
@@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage<Value> {
    buffer: BTreeMap<LayerKey, Option<Value>>,

    /// All current layers. This is not used for search. Only to make rebuilds easier.
+    // TODO: This map is never cleared. Rebuilds could use the post-trim last entry of
+    // [`Self::historic_coverage`] instead of doubling memory usage.
+    // [`Self::len`]: can require rebuild and serve from latest historic
+    // [`Self::iter`]: already requires rebuild => can serve from latest historic
    layers: BTreeMap<LayerKey, Value>,
 }

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -194,7 +194,7 @@ pub(crate) use download::{
 };
 use index::GcCompactionState;
 pub(crate) use index::LayerFileMetadata;
-use pageserver_api::models::TimelineArchivalState;
+use pageserver_api::models::{RelSizeMigration, TimelineArchivalState};
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use regex::Regex;
 use remote_storage::{
@@ -437,9 +437,13 @@ impl RemoteTimelineClient {

    /// Initialize the upload queue for the case where the remote storage is empty,
    /// i.e., it doesn't have an `IndexPart`.
+    ///
+    /// `rel_size_v2_status` needs to be carried over during branching, and that's why
+    /// it's passed in here.
    pub fn init_upload_queue_for_empty_remote(
        &self,
        local_metadata: &TimelineMetadata,
+        rel_size_v2_status: Option<RelSizeMigration>,
    ) -> anyhow::Result<()> {
        // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
        // certainly no point in starting more upload tasks than this.
@@ -449,7 +453,9 @@ impl RemoteTimelineClient {
            .as_ref()
            .map_or(0, |r| r.concurrency_limit());
        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
+        let initialized_queue =
+            upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
+        initialized_queue.dirty.rel_size_migration = rel_size_v2_status;
        self.update_remote_physical_size_gauge(None);
        info!("initialized upload queue as empty");
        Ok(())
@@ -900,7 +906,7 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Launch an index-file upload operation in the background, setting `import_pgdata` field.
+    /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
    pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
        self: &Arc<Self>,
        gc_compaction_state: GcCompactionState,
@@ -912,6 +918,21 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Set `rel_size_migration` in the dirty index part; no upload is scheduled here (see TODO).
+    pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
+        self: &Arc<Self>,
+        rel_size_v2_status: RelSizeMigration,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
+        // TODO: allow this operation to bypass the validation check because we might upload the index part
+        // with no layers but the flag updated. For now, we just modify the index part in memory and the next
+        // upload will include the flag.
+        // self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
+
    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///
@@ -933,6 +954,14 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Directly schedule an index upload; only used by the `patch_index_part` HTTP API.
+    pub fn force_schedule_index_upload(self: &Arc<Self>) -> Result<(), NotInitialized> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
+
    /// Launch an index-file upload operation in the background (internal function)
    fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
        let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -7,6 +7,7 @@ use std::collections::HashMap;

 use chrono::NaiveDateTime;
 use pageserver_api::models::AuxFilePolicy;
+use pageserver_api::models::RelSizeMigration;
 use pageserver_api::shard::ShardIndex;
 use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;
@@ -117,21 +118,6 @@ pub struct GcCompactionState {
    pub(crate) last_completed_lsn: Lsn,
 }

-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-pub enum RelSizeMigration {
-    /// The tenant is using the old rel_size format.
-    /// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
-    /// `None` is the same as `Some(RelSizeMigration::Legacy)`.
-    Legacy,
-    /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
-    /// persisted in the index part. The read path will read both formats and merge them.
-    Migrating,
-    /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
-    /// in the index part, and the read path will not read the old format.
-    Migrated,
-}
-
 impl IndexPart {
    /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
    /// used to understand later versions.
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -491,7 +491,10 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
        let remote_storage = self.remote_storage.clone();
        let conf = self.tenant_manager.get_conf();
        let tenant_shard_id = *secondary_state.get_tenant_shard_id();
-        let download_ctx = self.root_ctx.attached_child();
+        let download_ctx = self
+            .root_ctx
+            .attached_child()
+            .with_scope_secondary_tenant(&tenant_shard_id);
        (RunningDownload { barrier }, Box::pin(async move {
            let _completion = completion;

@@ -771,6 +774,7 @@ impl<'a> TenantDownloader<'a> {

        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
+            let ctx = &ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline.timeline_id);
            let timeline_state = timeline_states
                .remove(&timeline.timeline_id)
                .expect("Just populated above");
@@ -869,8 +873,7 @@ impl<'a> TenantDownloader<'a> {
                let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();

                let layers_in_heatmap = heatmap_timeline
-                    .layers
-                    .iter()
+                    .hot_layers()
                    .map(|l| (&l.name, l.metadata.generation))
                    .collect::<HashSet<_>>();
                let layers_on_disk = timeline_state
@@ -1015,7 +1018,8 @@ impl<'a> TenantDownloader<'a> {
        // Accumulate updates to the state
        let mut touched = Vec::new();

-        for layer in timeline.layers {
+        let timeline_id = timeline.timeline_id;
+        for layer in timeline.into_hot_layers() {
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!("Cancelled -- dropping out of layer loop");
                return (Err(UpdateError::Cancelled), touched);
@@ -1040,7 +1044,7 @@ impl<'a> TenantDownloader<'a> {
            }

            match self
-                .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
+                .download_layer(tenant_shard_id, &timeline_id, layer, ctx)
                .await
            {
                Ok(Some(layer)) => touched.push(layer),
@@ -1148,7 +1152,7 @@ impl<'a> TenantDownloader<'a> {
        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
        let timeline_id = timeline.timeline_id;

-        tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
+        tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());

        let (result, touched) = self
            .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
@@ -1316,11 +1320,11 @@ async fn init_timeline_state(
    // As we iterate through layers found on disk, we will look up their metadata from this map.
    // Layers not present in metadata will be discarded.
    let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
-        heatmap.layers.iter().map(|l| (&l.name, l)).collect();
+        heatmap.hot_layers().map(|l| (&l.name, l)).collect();

    let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
        if let Some(last_heatmap) = last_heatmap {
-            last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
+            last_heatmap.hot_layers().map(|l| (&l.name, l)).collect()
        } else {
            HashMap::new()
        };
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline {
    #[serde_as(as = "DisplayFromStr")]
    pub(crate) timeline_id: TimelineId,

-    pub(crate) layers: Vec<HeatMapLayer>,
+    layers: Vec<HeatMapLayer>,
 }

 #[serde_as]
@@ -53,8 +53,10 @@ pub(crate) struct HeatMapLayer {

    #[serde_as(as = "TimestampSeconds<i64>")]
    pub(crate) access_time: SystemTime,
-    // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
-    // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
+
+    #[serde(default)]
+    pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
+                           // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
 }

 impl HeatMapLayer {
@@ -62,11 +64,13 @@ impl HeatMapLayer {
        name: LayerName,
        metadata: LayerFileMetadata,
        access_time: SystemTime,
+        cold: bool,
    ) -> Self {
        Self {
            name,
            metadata,
            access_time,
+            cold,
        }
    }
 }
@@ -78,6 +82,18 @@ impl HeatMapTimeline {
            layers,
        }
    }
+    /// Consumes the timeline and yields, by value, the layers whose `cold` flag is not set.
+    pub(crate) fn into_hot_layers(self) -> impl Iterator<Item = HeatMapLayer> {
+        self.layers.into_iter().filter(|l| !l.cold)
+    }
+    /// Yields references to the layers whose `cold` flag is not set.
+    pub(crate) fn hot_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
+        self.layers.iter().filter(|l| !l.cold)
+    }
+    /// Yields references to all layers, regardless of their `cold` flag.
+    pub(crate) fn all_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
+        self.layers.iter()
+    }
 }

 pub(crate) struct HeatMapStats {
@@ -92,7 +108,7 @@ impl HeatMapTenant {
            layers: 0,
        };
        for timeline in &self.timelines {
-            for layer in &timeline.layers {
+            for layer in timeline.hot_layers() {
                stats.layers += 1;
                stats.bytes += layer.metadata.file_size;
            }
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -474,7 +474,7 @@ async fn fill_logical_sizes(
            if cached_size.is_none() {
                let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
                let parallel_size_calcs = Arc::clone(limit);
-                let ctx = ctx.attached_child();
+                let ctx = ctx.attached_child().with_scope_timeline(&timeline);
                joinset.spawn(
                    calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
                        .in_current_span(),
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard;

 use self::inmemory_layer::InMemoryLayerFileId;
 use super::PageReconstructError;
+use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
 use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
@@ -721,6 +722,12 @@ struct LayerToVisitId {
    lsn_floor: Lsn,
 }

+#[derive(Debug, PartialEq, Eq, Hash)]
+pub enum ReadableLayerWeak {
+    PersistentLayer(Arc<PersistentLayerDesc>), // refers to a persistent layer by its descriptor
+    InMemoryLayer(InMemoryLayerDesc), // refers to an in-memory layer by its descriptor
+}
+
 /// Layer wrapper for the read path. Note that it is valid
 /// to use these layers even after external operations have
 /// been performed on them (compaction, freeze, etc.).
@@ -873,7 +880,7 @@ impl ReadableLayer {
            }
            ReadableLayer::InMemoryLayer(layer) => {
                layer
-                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
                    .await
            }
        }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1385,7 +1385,7 @@ impl DeltaLayerInner {
            block_reader,
        );

-        tree_reader.dump().await?;
+        tree_reader.dump(ctx).await?;

        let keys = self.index_entries(ctx).await?;

@@ -2024,6 +2024,7 @@ pub(crate) mod test {
            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
            .await
            .unwrap();
+        let ctx = &ctx.with_scope_timeline(&timeline);

        let initdb_layer = timeline
            .layers
@@ -2136,7 +2137,7 @@ pub(crate) mod test {
            .await
            .unwrap();

-            let new_layer = new_layer.download_and_keep_resident().await.unwrap();
+            let new_layer = new_layer.download_and_keep_resident(ctx).await.unwrap();

            new_layer
                .copy_delta_prefix(&mut writer, truncate_at, ctx)
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -208,7 +208,7 @@ impl ImageLayerInner {
            block_reader,
        );

-        tree_reader.dump().await?;
+        tree_reader.dump(ctx).await?;

        tree_reader
            .visit(
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -416,7 +416,7 @@ impl InMemoryLayer {
    pub(crate) async fn get_values_reconstruct_data(
        self: &Arc<InMemoryLayer>,
        keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
@@ -433,8 +433,6 @@ impl InMemoryLayer {
        let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
        let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();

-        let lsn_range = self.start_lsn..end_lsn;
-
        for range in keyspace.ranges.iter() {
            for (key, vec_map) in inner
                .index
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -324,16 +324,16 @@ impl Layer {
        reconstruct_data: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        let downloaded = self
-            .0
-            .get_or_maybe_download(true, Some(ctx))
-            .await
-            .map_err(|err| match err {
-                DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
-                    GetVectoredError::Cancelled
-                }
-                other => GetVectoredError::Other(anyhow::anyhow!(other)),
-            })?;
+        let downloaded =
+            self.0
+                .get_or_maybe_download(true, ctx)
+                .await
+                .map_err(|err| match err {
+                    DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
+                        GetVectoredError::Cancelled
+                    }
+                    other => GetVectoredError::Other(anyhow::anyhow!(other)),
+                })?;
        let this = ResidentLayer {
            downloaded: downloaded.clone(),
            owner: self.clone(),
@@ -356,8 +356,8 @@ impl Layer {
    /// Download the layer if evicted.
    ///
    /// Will not error when the layer is already downloaded.
-    pub(crate) async fn download(&self) -> Result<(), DownloadError> {
-        self.0.get_or_maybe_download(true, None).await?;
+    pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
+        self.0.get_or_maybe_download(true, ctx).await?;
        Ok(())
    }

@@ -392,8 +392,11 @@ impl Layer {
    }

    /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
-    pub(crate) async fn download_and_keep_resident(&self) -> Result<ResidentLayer, DownloadError> {
-        let downloaded = self.0.get_or_maybe_download(true, None).await?;
+    pub(crate) async fn download_and_keep_resident(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<ResidentLayer, DownloadError> {
+        let downloaded = self.0.get_or_maybe_download(true, ctx).await?;

        Ok(ResidentLayer {
            downloaded,
@@ -446,7 +449,7 @@ impl Layer {

        if verbose {
            // for now, unconditionally download everything, even if that might not be wanted.
-            let l = self.0.get_or_maybe_download(true, Some(ctx)).await?;
+            let l = self.0.get_or_maybe_download(true, ctx).await?;
            l.dump(&self.0, ctx).await?
        }

@@ -945,7 +948,7 @@ impl LayerInner {
    async fn get_or_maybe_download(
        self: &Arc<Self>,
        allow_download: bool,
-        ctx: Option<&RequestContext>,
+        ctx: &RequestContext,
    ) -> Result<Arc<DownloadedLayer>, DownloadError> {
        let (weak, permit) = {
            // get_or_init_detached can:
@@ -1035,21 +1038,14 @@ impl LayerInner {
            return Err(DownloadError::NotFile(ft));
        }

-        if let Some(ctx) = ctx {
-            self.check_expected_download(ctx)?;
-        }
+        self.check_expected_download(ctx)?;

        if !allow_download {
            // this is only used from tests, but it is hard to test without the boolean
            return Err(DownloadError::DownloadRequired);
        }

-        let download_ctx = ctx
-            .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download))
-            .unwrap_or(RequestContext::new(
-                TaskKind::LayerDownload,
-                DownloadBehavior::Download,
-            ));
+        let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download);

        async move {
            tracing::info!(%reason, "downloading on-demand");
@@ -1567,10 +1563,10 @@ impl LayerInner {

        self.access_stats.record_residence_event();

-        self.status.as_ref().unwrap().send_replace(Status::Evicted);
-
        *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());

+        self.status.as_ref().unwrap().send_replace(Status::Evicted);
+
        Ok(())
    }

--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -8,7 +8,6 @@ use utils::id::TimelineId;
 use super::failpoints::{Failpoint, FailpointKind};
 use super::*;
 use crate::context::DownloadBehavior;
-use crate::task_mgr::TaskKind;
 use crate::tenant::harness::{TenantHarness, test_img};
 use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint};

@@ -27,11 +26,9 @@ async fn smoke_test() {
    let h = TenantHarness::create("smoke_test").await.unwrap();
    let span = h.span();
    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
-    let (tenant, _) = h.load().await;
+    let (tenant, ctx) = h.load().await;
    let io_concurrency = IoConcurrency::spawn_for_test();

-    let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
-
    let image_layers = vec![(
        Lsn(0x40),
        vec![(
@@ -49,12 +46,14 @@ async fn smoke_test() {
            Lsn(0x10),
            14,
            &ctx,
+            Default::default(), // in-memory layers
            Default::default(),
            image_layers,
            Lsn(0x100),
        )
        .await
        .unwrap();
+    let ctx = &ctx.with_scope_timeline(&timeline);

    // Grab one of the timeline's layers to exercise in the test, and the other layer that is just
    // there to avoid the timeline being illegally empty
@@ -93,7 +92,7 @@ async fn smoke_test() {
                controlfile_keyspace.clone(),
                Lsn(0x10)..Lsn(0x11),
                &mut data,
-                &ctx,
+                ctx,
            )
            .await
            .unwrap();
@@ -128,7 +127,7 @@ async fn smoke_test() {
                controlfile_keyspace.clone(),
                Lsn(0x10)..Lsn(0x11),
                &mut data,
-                &ctx,
+                ctx,
            )
            .instrument(download_span.clone())
            .await
@@ -178,7 +177,7 @@ async fn smoke_test() {

    // plain downloading is rarely needed
    layer
-        .download_and_keep_resident()
+        .download_and_keep_resident(ctx)
        .instrument(download_span)
        .await
        .unwrap();
@@ -340,6 +339,7 @@ fn read_wins_pending_eviction() {
            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
            .await
            .unwrap();
+        let ctx = ctx.with_scope_timeline(&timeline);

        let layer = {
            let mut layers = {
@@ -379,7 +379,7 @@ fn read_wins_pending_eviction() {
        // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
        layer
            .0
-            .get_or_maybe_download(false, None)
+            .get_or_maybe_download(false, &ctx)
            .instrument(download_span)
            .await
            .expect("should had reinitialized without downloading");
@@ -472,6 +472,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
            .await
            .unwrap();
+        let ctx = ctx.with_scope_timeline(&timeline);

        let layer = {
            let mut layers = {
@@ -514,7 +515,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
        // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
        layer
            .0
-            .get_or_maybe_download(false, None)
+            .get_or_maybe_download(false, &ctx)
            .instrument(download_span)
            .await
            .expect("should had reinitialized without downloading");
@@ -641,7 +642,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
        .await
        .unwrap();
+    let ctx = ctx.with_scope_timeline(&timeline);

+    // This test does downloads
+    let ctx = RequestContextBuilder::extend(&ctx)
+        .download_behavior(DownloadBehavior::Download)
+        .build();
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
@@ -674,7 +680,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    // simulate a cancelled read which is cancelled before it gets to re-initialize
    let e = layer
        .0
-        .get_or_maybe_download(false, None)
+        .get_or_maybe_download(false, &ctx)
        .await
        .unwrap_err();
    assert!(
@@ -698,7 +704,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    // failpoint is still enabled, but it is not hit
    let e = layer
        .0
-        .get_or_maybe_download(false, None)
+        .get_or_maybe_download(false, &ctx)
        .await
        .unwrap_err();
    assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
@@ -721,6 +727,12 @@ async fn evict_and_wait_does_not_wait_for_download() {
        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
        .await
        .unwrap();
+    let ctx = ctx.with_scope_timeline(&timeline);
+
+    // This test does downloads
+    let ctx = RequestContextBuilder::extend(&ctx)
+        .download_behavior(DownloadBehavior::Download)
+        .build();

    let layer = {
        let mut layers = {
@@ -768,7 +780,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
    let mut download = std::pin::pin!(
        layer
            .0
-            .get_or_maybe_download(true, None)
+            .get_or_maybe_download(true, &ctx)
            .instrument(download_span)
    );

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -289,15 +289,14 @@ fn log_compaction_error(
 ) {
    use CompactionError::*;

-    use crate::pgdatadir_mapping::CollectKeySpaceError;
    use crate::tenant::PageReconstructError;
    use crate::tenant::upload_queue::NotInitialized;

    let level = match err {
+        e if e.is_cancel() => return,
        ShuttingDown => return,
        Offload(_) => Level::ERROR,
        AlreadyRunning(_) => Level::ERROR,
-        CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO,
        CollectKeySpaceError(_) => Level::ERROR,
        _ if task_cancelled => Level::INFO,
        Other(err) => {
@@ -474,21 +473,15 @@ async fn wait_for_active_tenant(
    }

    let mut update_rx = tenant.subscribe_for_state_updates();
-    loop {
-        tokio::select! {
-            _ = cancel.cancelled() => return ControlFlow::Break(()),
-            result = update_rx.changed() => if result.is_err() {
+    tokio::select! {
+        result = update_rx.wait_for(|s| s == &TenantState::Active) => {
+            if result.is_err() {
                return ControlFlow::Break(());
            }
-        }
-
-        match &*update_rx.borrow() {
-            TenantState::Active => {
-                debug!("Tenant state changed to active, continuing the task loop");
-                return ControlFlow::Continue(());
-            }
-            state => debug!("Not running the task loop, tenant is not active: {state:?}"),
-        }
+            debug!("Tenant state changed to active, continuing the task loop");
+            ControlFlow::Continue(())
+        },
+        _ = cancel.cancelled() => ControlFlow::Break(()),
    }
 }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart
 use pageserver_api::models::{
    CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings,
    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
-    InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState,
+    InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState,
 };
 use pageserver_api::reltag::{BlockNumber, RelTag};
 use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
@@ -99,7 +99,8 @@ use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate,
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::l0_flush::{self, L0FlushGlobalState};
 use crate::metrics::{
-    DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
+    DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL,
+    LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
 };
 use crate::page_service::TenantManagerTypes;
 use crate::pgdatadir_mapping::{
@@ -286,7 +287,7 @@ pub struct Timeline {
    // The LSN of gc-compaction that was last applied to this timeline.
    gc_compaction_state: ArcSwap<Option<GcCompactionState>>,

-    pub(super) metrics: TimelineMetrics,
+    pub(crate) metrics: Arc<TimelineMetrics>,

    // `Timeline` doesn't write these metrics itself, but it manages the lifetime.  Code
    // in `crate::page_service` writes these metrics.
@@ -436,12 +437,16 @@ pub struct Timeline {
    /// May host a background Tokio task which downloads all the layers from the current
    /// heatmap on demand.
    heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
+
+    pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
 }

 pub(crate) enum PreviousHeatmap {
    Active {
        heatmap: HeatMapTimeline,
        read_at: std::time::Instant,
+        // End LSN covered by the heatmap if known
+        end_lsn: Option<Lsn>,
    },
    Obsolete,
 }
@@ -1326,10 +1331,6 @@ impl Timeline {
        // (this is a requirement, not a bug). Skip updating the metric in these cases
        // to avoid infinite results.
        if !results.is_empty() {
-            // Record the total number of layers visited towards each key in the batch. While some
-            // layers may not intersect with a given read, and the cost of layer visits are
-            // amortized across the batch, each visited layer contributes directly to the observed
-            // latency for every read in the batch, which is what we care about.
            if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
                static LOG_PACER: Lazy<Mutex<RateLimit>> =
                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
@@ -1344,9 +1345,23 @@ impl Timeline {
                });
            }

+            // Records the number of layers visited in a few different ways:
+            //
+            // * LAYERS_PER_READ: all layers count towards every read in the batch, because each
+            //   layer directly affects its observed latency.
+            //
+            // * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch
+            //   layer visits and access cost.
+            //
+            // * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized
+            //   read amplification after batching.
+            let layers_visited = layers_visited as f64;
+            let avg_layers_visited = layers_visited / results.len() as f64;
+            LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited);
            for _ in &results {
-                self.metrics.layers_per_read.observe(layers_visited as f64);
-                LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
+                self.metrics.layers_per_read.observe(layers_visited);
+                LAYERS_PER_READ_GLOBAL.observe(layers_visited);
+                LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited);
            }
        }

@@ -1864,16 +1879,25 @@ impl Timeline {
        };

        // Signal compaction failure to avoid L0 flush stalls when it's broken.
-        match result {
+        match &result {
            Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
-            Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => {
+            Err(e) if e.is_cancel() => {}
+            Err(CompactionError::ShuttingDown) => {
+                // Covered by the `Err(e) if e.is_cancel()` branch.
+            }
+            Err(CompactionError::AlreadyRunning(_)) => {
+                // Covered by the `Err(e) if e.is_cancel()` branch.
+            }
+            Err(CompactionError::Other(_)) => {
+                self.compaction_failed.store(true, AtomicOrdering::Relaxed)
+            }
+            Err(CompactionError::CollectKeySpaceError(_)) => {
+                // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch.
                self.compaction_failed.store(true, AtomicOrdering::Relaxed)
            }
            // Don't change the current value on offload failure or shutdown. We don't want to
            // abruptly stall nor resume L0 flushes in these cases.
            Err(CompactionError::Offload(_)) => {}
-            Err(CompactionError::ShuttingDown) => {}
-            Err(CompactionError::AlreadyRunning(_)) => {}
        };

        result
@@ -2188,6 +2212,7 @@ impl Timeline {
    pub(crate) async fn download_layer(
        &self,
        layer_file_name: &LayerName,
+        ctx: &RequestContext,
    ) -> Result<Option<bool>, super::storage_layer::layer::DownloadError> {
        let Some(layer) = self
            .find_layer(layer_file_name)
@@ -2201,7 +2226,7 @@ impl Timeline {
            return Ok(None);
        };

-        layer.download().await?;
+        layer.download(ctx).await?;

        Ok(Some(true))
    }
@@ -2356,6 +2381,9 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

+    /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path
+    /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is
+    /// possible that the index part persists the state while the config doesn't get persisted.
    pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2364,6 +2392,14 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
    }

+    pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration {
+        self.rel_size_v2_status
+            .load()
+            .as_ref()
+            .map(|s| s.as_ref().clone())
+            .unwrap_or(RelSizeMigration::Legacy)
+    }
+
    fn get_compaction_upper_limit(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2624,6 +2660,7 @@ impl Timeline {
        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
        create_idempotency: crate::tenant::CreateTimelineIdempotency,
        gc_compaction_state: Option<GcCompactionState>,
+        rel_size_v2_status: Option<RelSizeMigration>,
        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2648,14 +2685,14 @@ impl Timeline {
        }

        Arc::new_cyclic(|myself| {
-            let metrics = TimelineMetrics::new(
+            let metrics = Arc::new(TimelineMetrics::new(
                &tenant_shard_id,
                &timeline_id,
                crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
                    "mtime",
                    evictions_low_residence_duration_metric_threshold,
                ),
-            );
+            ));
            let aux_file_metrics = metrics.aux_file_size_gauge.clone();

            let mut result = Timeline {
@@ -2782,6 +2819,8 @@ impl Timeline {
                previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),

                heatmap_layers_downloader: Mutex::new(None),
+
+                rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
            };

            result.repartition_threshold =
@@ -2837,7 +2876,7 @@ impl Timeline {
            "layer flush task",
            async move {
                let _guard = guard;
-                let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
+                let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error).with_scope_timeline(&self_clone);
                self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
                let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
                assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
@@ -2858,6 +2897,16 @@ impl Timeline {
            .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
    }

+    pub(crate) fn update_rel_size_v2_status(
+        &self,
+        rel_size_v2_status: RelSizeMigration,
+    ) -> anyhow::Result<()> {
+        self.rel_size_v2_status
+            .store(Some(Arc::new(rel_size_v2_status.clone())));
+        self.remote_client
+            .schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status)
+    }
+
    pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
        self.gc_compaction_state.load_full().as_ref().clone()
    }
@@ -3560,12 +3609,16 @@ impl Timeline {
        Ok(layer)
    }

-    pub(super) fn is_previous_heatmap_active(&self) -> bool {
-        self.previous_heatmap
-            .load()
-            .as_ref()
-            .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. }))
-            .unwrap_or(false)
+    pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool {
+        let crnt = self.previous_heatmap.load();
+        match crnt.as_deref() {
+            Some(PreviousHeatmap::Active { end_lsn, .. }) => match end_lsn {
+                Some(crnt_end_lsn) => *crnt_end_lsn > new_heatmap_end_lsn,
+                None => true,
+            },
+            Some(PreviousHeatmap::Obsolete) => false,
+            None => false,
+        }
    }

    /// The timeline heatmap is a hint to secondary locations from the primary location,
@@ -3593,26 +3646,26 @@ impl Timeline {
        // heatamp.
        let previous_heatmap = self.previous_heatmap.load();
        let visible_non_resident = match previous_heatmap.as_deref() {
-            Some(PreviousHeatmap::Active { heatmap, read_at }) => {
-                Some(heatmap.layers.iter().filter_map(|hl| {
-                    let desc: PersistentLayerDesc = hl.name.clone().into();
-                    let layer = guard.try_get_from_key(&desc.key())?;
+            Some(PreviousHeatmap::Active {
+                heatmap, read_at, ..
+            }) => Some(heatmap.all_layers().filter_map(|hl| {
+                let desc: PersistentLayerDesc = hl.name.clone().into();
+                let layer = guard.try_get_from_key(&desc.key())?;

-                    if layer.visibility() == LayerVisibilityHint::Covered {
-                        return None;
-                    }
+                if layer.visibility() == LayerVisibilityHint::Covered {
+                    return None;
+                }

-                    if layer.is_likely_resident() {
-                        return None;
-                    }
+                if layer.is_likely_resident() {
+                    return None;
+                }

-                    if layer.last_evicted_at().happened_after(*read_at) {
-                        return None;
-                    }
+                if layer.last_evicted_at().happened_after(*read_at) {
+                    return None;
+                }

-                    Some((desc, hl.metadata.clone(), hl.access_time))
-                }))
-            }
+                Some((desc, hl.metadata.clone(), hl.access_time, hl.cold))
+            })),
            Some(PreviousHeatmap::Obsolete) => None,
            None => None,
        };
@@ -3627,6 +3680,7 @@ impl Timeline {
                        layer.layer_desc().clone(),
                        layer.metadata(),
                        last_activity_ts,
+                        false, // these layers are not cold
                    ))
                }
                LayerVisibilityHint::Covered => {
@@ -3653,12 +3707,14 @@ impl Timeline {
        // Sort layers in order of which to download first.  For a large set of layers to download, we
        // want to prioritize those layers which are most likely to still be in the resident many minutes
        // or hours later:
+        // - Cold layers go last for convenience when a human inspects the heatmap.
        // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
        //   only exist for a few minutes before being compacted into L1s.
        // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
        //   the layer is likely to be covered by an image layer during compaction.
-        layers.sort_by_key(|(desc, _meta, _atime)| {
+        layers.sort_by_key(|(desc, _meta, _atime, cold)| {
            std::cmp::Reverse((
+                *cold,
                !LayerMap::is_l0(&desc.key_range, desc.is_delta),
                desc.lsn_range.end,
            ))
@@ -3666,7 +3722,9 @@ impl Timeline {

        let layers = layers
            .into_iter()
-            .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
+            .map(|(desc, meta, atime, cold)| {
+                HeatMapLayer::new(desc.layer_name(), meta, atime, cold)
+            })
            .collect();

        Some(HeatMapTimeline::new(self.timeline_id, layers))
@@ -3686,6 +3744,7 @@ impl Timeline {
                name: vl.layer_desc().layer_name(),
                metadata: vl.metadata(),
                access_time: now,
+                cold: true,
            };
            heatmap_layers.push(hl);
        }
@@ -3699,6 +3758,7 @@ impl Timeline {
        PreviousHeatmap::Active {
            heatmap,
            read_at: Instant::now(),
+            end_lsn: Some(end_lsn),
        }
    }

@@ -3897,39 +3957,22 @@ impl Timeline {
                let guard = timeline.layers.read().await;
                let layers = guard.layer_map()?;

-                let in_memory_layer = layers.find_in_memory_layer(|l| {
-                    let start_lsn = l.get_lsn_range().start;
-                    cont_lsn > start_lsn
-                });
+                for range in unmapped_keyspace.ranges.iter() {
+                    let results = layers.range_search(range.clone(), cont_lsn);

-                match in_memory_layer {
-                    Some(l) => {
-                        let lsn_range = l.get_lsn_range().start..cont_lsn;
-                        fringe.update(
-                            ReadableLayer::InMemoryLayer(l),
-                            unmapped_keyspace.clone(),
-                            lsn_range,
-                        );
-                    }
-                    None => {
-                        for range in unmapped_keyspace.ranges.iter() {
-                            let results = layers.range_search(range.clone(), cont_lsn);
-
-                            results
-                                .found
-                                .into_iter()
-                                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
-                                    (
-                                        ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
-                                        keyspace_accum.to_keyspace(),
-                                        lsn_floor..cont_lsn,
-                                    )
-                                })
-                                .for_each(|(layer, keyspace, lsn_range)| {
-                                    fringe.update(layer, keyspace, lsn_range)
-                                });
-                        }
-                    }
+                    results
+                        .found
+                        .into_iter()
+                        .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                            (
+                                guard.upgrade(layer),
+                                keyspace_accum.to_keyspace(),
+                                lsn_floor..cont_lsn,
+                            )
+                        })
+                        .for_each(|(layer, keyspace, lsn_range)| {
+                            fringe.update(layer, keyspace, lsn_range)
+                        });
                }

                // It's safe to drop the layer map lock after planning the next round of reads.
@@ -4202,10 +4245,6 @@ impl Timeline {

                // Stall flushes to backpressure if compaction can't keep up. This is propagated up
                // to WAL ingestion by having ephemeral layer rolls wait for flushes.
-                //
-                // NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so
-                // we can end up stalling before compaction even starts. Consider making it more
-                // responsive (e.g. via `watch_level0_deltas`).
                if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() {
                    if l0_count >= stall_threshold {
                        warn!(
@@ -4693,10 +4732,7 @@ impl Timeline {
            ));
        }

-        let (dense_ks, sparse_ks) = self
-            .collect_keyspace(lsn, ctx)
-            .await
-            .map_err(CompactionError::CollectKeySpaceError)?;
+        let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
        let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
        let sparse_partitioning = SparseKeyPartitioning {
            parts: vec![sparse_ks],
@@ -5423,13 +5459,42 @@ pub(crate) enum CompactionError {
    Offload(OffloadError),
    /// Compaction cannot be done right now; page reconstruction and so on.
    #[error("Failed to collect keyspace: {0}")]
-    CollectKeySpaceError(CollectKeySpaceError),
+    CollectKeySpaceError(#[from] CollectKeySpaceError),
    #[error(transparent)]
    Other(anyhow::Error),
    #[error("Compaction already running: {0}")]
    AlreadyRunning(&'static str),
 }

+impl CompactionError {
+    /// Errors that can be ignored, i.e., cancellation, shutdown, and compaction already running.
+    pub fn is_cancel(&self) -> bool {
+        matches!(
+            self,
+            Self::ShuttingDown
+                | Self::AlreadyRunning(_)
+                | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
+                | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
+                    PageReconstructError::Cancelled
+                ))
+                | Self::Offload(OffloadError::Cancelled)
+        )
+    }
+
+    /// Critical errors that indicate data corruption.
+    pub fn is_critical(&self) -> bool {
+        matches!(
+            self,
+            Self::CollectKeySpaceError(
+                CollectKeySpaceError::Decode(_)
+                    | CollectKeySpaceError::PageRead(
+                        PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
+                    )
+            )
+        )
+    }
+}
+
 impl From<OffloadError> for CompactionError {
    fn from(e: OffloadError) -> Self {
        match e {
@@ -5439,18 +5504,6 @@ impl From<OffloadError> for CompactionError {
    }
 }

-impl From<CollectKeySpaceError> for CompactionError {
-    fn from(err: CollectKeySpaceError) -> Self {
-        match err {
-            CollectKeySpaceError::Cancelled
-            | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => {
-                CompactionError::ShuttingDown
-            }
-            e => CompactionError::Other(e.into()),
-        }
-    }
-}
-
 impl From<super::upload_queue::NotInitialized> for CompactionError {
    fn from(value: super::upload_queue::NotInitialized) -> Self {
        match value {
@@ -5534,6 +5587,14 @@ pub struct DeltaLayerTestDesc {
    pub data: Vec<(Key, Lsn, Value)>,
 }

+#[cfg(test)]
+#[derive(Clone)]
+pub struct InMemoryLayerTestDesc {
+    pub lsn_range: Range<Lsn>,
+    pub data: Vec<(Key, Lsn, Value)>,
+    pub is_open: bool,
+}
+
 #[cfg(test)]
 impl DeltaLayerTestDesc {
    pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
@@ -6193,6 +6254,7 @@ impl Timeline {
    pub(crate) async fn spawn_download_all_remote_layers(
        self: Arc<Self>,
        request: DownloadRemoteLayersTaskSpawnRequest,
+        ctx: &RequestContext,
    ) -> Result<DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskInfo> {
        use pageserver_api::models::DownloadRemoteLayersTaskState;

@@ -6213,6 +6275,10 @@ impl Timeline {
        }

        let self_clone = Arc::clone(&self);
+        let task_ctx = ctx.detached_child(
+            TaskKind::DownloadAllRemoteLayers,
+            DownloadBehavior::Download,
+        );
        let task_id = task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::DownloadAllRemoteLayers,
@@ -6220,7 +6286,7 @@ impl Timeline {
            Some(self.timeline_id),
            "download all remote layers task",
            async move {
-                self_clone.download_all_remote_layers(request).await;
+                self_clone.download_all_remote_layers(request, &task_ctx).await;
                let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
                 match &mut *status_guard {
                    None => {
@@ -6255,6 +6321,7 @@ impl Timeline {
    async fn download_all_remote_layers(
        self: &Arc<Self>,
        request: DownloadRemoteLayersTaskSpawnRequest,
+        ctx: &RequestContext,
    ) {
        use pageserver_api::models::DownloadRemoteLayersTaskState;

@@ -6311,9 +6378,10 @@ impl Timeline {

                let span = tracing::info_span!("download", layer = %next);

+                let ctx = ctx.attached_child();
                js.spawn(
                    async move {
-                        let res = next.download().await;
+                        let res = next.download(&ctx).await;
                        (next, res)
                    }
                    .instrument(span),
@@ -6541,6 +6609,92 @@ impl Timeline {
        Ok(())
    }

+    /// Force create an in-memory layer and place it into the layer map.
+    #[cfg(test)]
+    pub(super) async fn force_create_in_memory_layer(
+        self: &Arc<Timeline>,
+        mut in_memory: InMemoryLayerTestDesc,
+        check_start_lsn: Option<Lsn>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        use utils::bin_ser::BeSer;
+
+        // Validate LSNs
+        if let Some(check_start_lsn) = check_start_lsn {
+            assert!(in_memory.lsn_range.start >= check_start_lsn);
+        }
+
+        let last_record_lsn = self.get_last_record_lsn();
+        let layer_end_lsn = if in_memory.is_open {
+            in_memory
+                .data
+                .iter()
+                .map(|(_key, lsn, _value)| lsn)
+                .max()
+                .cloned()
+        } else {
+            Some(in_memory.lsn_range.end)
+        };
+
+        if let Some(end) = layer_end_lsn {
+            assert!(
+                end <= last_record_lsn,
+                "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
+                end,
+                last_record_lsn,
+            );
+        }
+
+        in_memory.data.iter().for_each(|(_key, lsn, _value)| {
+            assert!(*lsn >= in_memory.lsn_range.start);
+            assert!(*lsn < in_memory.lsn_range.end);
+        });
+
+        // Build the batch
+        in_memory
+            .data
+            .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
+
+        let data = in_memory
+            .data
+            .into_iter()
+            .map(|(key, lsn, value)| {
+                let value_size = value.serialized_size().unwrap() as usize;
+                (key.to_compact(), lsn, value_size, value)
+            })
+            .collect::<Vec<_>>();
+
+        let batch = SerializedValueBatch::from_values(data);
+
+        // Create the in-memory layer and write the batch into it
+        let layer = InMemoryLayer::create(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            in_memory.lsn_range.start,
+            &self.gate,
+            ctx,
+        )
+        .await
+        .unwrap();
+
+        layer.put_batch(batch, ctx).await.unwrap();
+        if !in_memory.is_open {
+            layer.freeze(in_memory.lsn_range.end).await;
+        }
+
+        info!("force created in-memory layer {:?}", in_memory.lsn_range);
+
+        // Link the layer to the layer map
+        {
+            let mut guard = self.layers.write().await;
+            let layer_map = guard.open_mut().unwrap();
+            layer_map.force_insert_in_memory_layer(Arc::new(layer));
+        }
+
+        Ok(())
+    }
+
    /// Return all keys at the LSN in the image layers
    #[cfg(test)]
    pub(crate) async fn inspect_image_layers(
@@ -6900,11 +7054,13 @@ mod tests {

    use pageserver_api::key::Key;
    use pageserver_api::value::Value;
+    use std::iter::Iterator;
    use tracing::Instrument;
    use utils::id::TimelineId;
    use utils::lsn::Lsn;

    use super::HeatMapTimeline;
+    use crate::context::RequestContextBuilder;
    use crate::tenant::harness::{TenantHarness, test_img};
    use crate::tenant::layer_map::LayerMap;
    use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint};
@@ -6912,8 +7068,8 @@ mod tests {
    use crate::tenant::{PreviousHeatmap, Timeline};

    fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
-        assert_eq!(lhs.layers.len(), rhs.layers.len());
-        let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
+        assert_eq!(lhs.all_layers().count(), rhs.all_layers().count());
+        let lhs_rhs = lhs.all_layers().zip(rhs.all_layers());
        for (l, r) in lhs_rhs {
            assert_eq!(l.name, r.name);
            assert_eq!(l.metadata, r.metadata);
@@ -6972,12 +7128,14 @@ mod tests {
                Lsn(0x10),
                14,
                &ctx,
+                Vec::new(), // in-memory layers
                delta_layers,
                image_layers,
                Lsn(0x100),
            )
            .await
            .unwrap();
+        let ctx = &ctx.with_scope_timeline(&timeline);

        // Layer visibility is an input to heatmap generation, so refresh it first
        timeline.update_layer_visibility().await.unwrap();
@@ -6990,10 +7148,11 @@ mod tests {
        assert_eq!(heatmap.timeline_id, timeline.timeline_id);

        // L0 should come last
-        assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
+        let heatmap_layers = heatmap.all_layers().collect::<Vec<_>>();
+        assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name());

        let mut last_lsn = Lsn::MAX;
-        for layer in &heatmap.layers {
+        for layer in heatmap_layers {
            // Covered layer should be omitted
            assert!(layer.name != covered_delta.layer_name());

@@ -7026,6 +7185,7 @@ mod tests {
            .store(Some(Arc::new(PreviousHeatmap::Active {
                heatmap: heatmap.clone(),
                read_at: std::time::Instant::now(),
+                end_lsn: None,
            })));

        // Generate a new heatmap and assert that it contains the same layers as the old one.
@@ -7041,8 +7201,12 @@ mod tests {

            eprintln!("Downloading {layer} and re-generating heatmap");

+            let ctx = &RequestContextBuilder::extend(ctx)
+                .download_behavior(crate::context::DownloadBehavior::Download)
+                .build();
+
            let _resident = layer
-                .download_and_keep_resident()
+                .download_and_keep_resident(ctx)
                .instrument(tracing::info_span!(
                    parent: None,
                    "download_layer",
@@ -7100,6 +7264,7 @@ mod tests {
                Lsn(0x10),
                14,
                &ctx,
+                Vec::new(), // in-memory layers
                delta_layers,
                image_layers,
                Lsn(0x100),
@@ -7116,7 +7281,7 @@ mod tests {
            .expect("Infallible while timeline is not shut down");

        // Both layers should be in the heatmap
-        assert!(!heatmap.layers.is_empty());
+        assert!(heatmap.all_layers().count() > 0);

        // Now simulate a migration.
        timeline
@@ -7124,6 +7289,7 @@ mod tests {
            .store(Some(Arc::new(PreviousHeatmap::Active {
                heatmap: heatmap.clone(),
                read_at: std::time::Instant::now(),
+                end_lsn: None,
            })));

        // Evict all the layers in the previous heatmap
@@ -7141,7 +7307,7 @@ mod tests {
            .await
            .expect("Infallible while timeline is not shut down");

-        assert!(post_eviction_heatmap.layers.is_empty());
+        assert_eq!(post_eviction_heatmap.all_layers().count(), 0);
        assert!(matches!(
            timeline.previous_heatmap.load().as_deref(),
            Some(PreviousHeatmap::Obsolete)
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -7,11 +7,20 @@
 use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
+use std::time::Instant;

-use anyhow::{Context, anyhow, bail};
+use super::layer_manager::LayerManager;
+use super::{
+    CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
+    GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration,
+    Timeline,
+};
+
+use anyhow::{Context, anyhow};
 use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
+use futures::FutureExt;
 use itertools::Itertools;
 use once_cell::sync::Lazy;
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
@@ -31,15 +40,8 @@ use utils::critical;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;

-use super::layer_manager::LayerManager;
-use super::{
-    CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
-    GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError,
-    RecordedDuration, Timeline,
-};
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::pgdatadir_mapping::CollectKeySpaceError;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::gc_block::GcBlock;
@@ -213,30 +215,39 @@ impl GcCompactionQueue {
    }

    /// Trigger an auto compaction.
-    pub async fn trigger_auto_compaction(&self, timeline: &Arc<Timeline>) {
+    pub async fn trigger_auto_compaction(
+        &self,
+        timeline: &Arc<Timeline>,
+    ) -> Result<(), CompactionError> {
        let GcCompactionCombinedSettings {
            gc_compaction_enabled,
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
        } = timeline.get_gc_compaction_settings();
        if !gc_compaction_enabled {
-            return;
+            return Ok(());
        }
        if self.remaining_jobs_num() > 0 {
            // Only schedule auto compaction when the queue is empty
-            return;
+            return Ok(());
        }
        if timeline.ancestor_timeline().is_some() {
            // Do not trigger auto compaction for child timelines. We haven't tested
            // it enough in staging yet.
-            return;
+            return Ok(());
+        }
+        if timeline.get_gc_compaction_watermark() == Lsn::INVALID {
+            // If the gc watermark is not set, we don't need to trigger auto compaction.
+            // This check is the same as in `gc_compaction_split_jobs` but we don't log
+            // here and we can also skip the computation of the trigger condition earlier.
+            return Ok(());
        }

        let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else {
            // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure
            // the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger`
            // to ensure the fairness while avoid starving other tasks.
-            return;
+            return Ok(());
        };

        let gc_compaction_state = timeline.get_gc_compaction_state();
@@ -246,7 +257,7 @@ impl GcCompactionQueue {

        let layers = {
            let guard = timeline.layers.read().await;
-            let layer_map = guard.layer_map().unwrap();
+            let layer_map = guard.layer_map()?;
            layer_map.iter_historic_layers().collect_vec()
        };
        let mut l2_size: u64 = 0;
@@ -318,11 +329,12 @@ impl GcCompactionQueue {
                l1_size, l2_size, l2_lsn, gc_cutoff
            );
        } else {
-            info!(
+            debug!(
                "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}",
                l1_size, l2_size, l2_lsn, gc_cutoff
            );
        }
+        Ok(())
    }

    /// Notify the caller the job has finished and unblock GC.
@@ -353,8 +365,7 @@ impl GcCompactionQueue {
                GcCompactJob::from_compact_options(options.clone()),
                options.sub_compaction_max_job_size_mb,
            )
-            .await
-            .map_err(CompactionError::Other)?;
+            .await?;
        if jobs.is_empty() {
            info!("no jobs to run, skipping scheduled compaction task");
            self.notify_and_unblock(id);
@@ -433,6 +444,7 @@ impl GcCompactionQueue {
            ));
        };
        let has_pending_tasks;
+        let mut yield_for_l0 = false;
        let Some((id, item)) = ({
            let mut guard = self.inner.lock().unwrap();
            if let Some((id, item)) = guard.queued.pop_front() {
@@ -444,7 +456,7 @@ impl GcCompactionQueue {
                None
            }
        }) else {
-            self.trigger_auto_compaction(timeline).await;
+            self.trigger_auto_compaction(timeline).await?;
            // Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we
            // have not implemented preemption mechanism yet. We always want to yield it to more important
            // tasks if there is one.
@@ -482,13 +494,23 @@ impl GcCompactionQueue {
                        let mut guard = self.inner.lock().unwrap();
                        guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
                    }
-                    let _ = timeline.compact_with_options(cancel, options, ctx).await?;
+                    let compaction_result =
+                        timeline.compact_with_options(cancel, options, ctx).await?;
                    self.notify_and_unblock(id);
+                    if compaction_result == CompactionOutcome::YieldForL0 {
+                        yield_for_l0 = true;
+                    }
                }
            }
            GcCompactionQueueItem::SubCompactionJob(options) => {
                // TODO: error handling, clear the queue if any task fails?
-                let _ = timeline.compact_with_options(cancel, options, ctx).await?;
+                let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?;
+                if compaction_result == CompactionOutcome::YieldForL0 {
+                    // We will permanently give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running
+                    // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because
+                    // we need to clean things up before returning from the function.
+                    yield_for_l0 = true;
+                }
            }
            GcCompactionQueueItem::Notify(id, l2_lsn) => {
                self.notify_and_unblock(id);
@@ -517,7 +539,10 @@ impl GcCompactionQueue {
            let mut guard = self.inner.lock().unwrap();
            guard.running = None;
        }
-        Ok(if has_pending_tasks {
+        Ok(if yield_for_l0 {
+            tracing::info!("give up gc-compaction: yield for L0 compaction");
+            CompactionOutcome::YieldForL0
+        } else if has_pending_tasks {
            CompactionOutcome::Pending
        } else {
            CompactionOutcome::Done
@@ -716,17 +741,41 @@ struct CompactionStatisticsNumSize {

 #[derive(Debug, Serialize, Default)]
 pub struct CompactionStatistics {
+    /// Delta layer visited (maybe compressed, physical size)
    delta_layer_visited: CompactionStatisticsNumSize,
+    /// Image layer visited (maybe compressed, physical size)
    image_layer_visited: CompactionStatisticsNumSize,
+    /// Delta layer produced (maybe compressed, physical size)
    delta_layer_produced: CompactionStatisticsNumSize,
+    /// Image layer produced (maybe compressed, physical size)
    image_layer_produced: CompactionStatisticsNumSize,
-    num_delta_layer_discarded: usize,
-    num_image_layer_discarded: usize,
+    /// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
+    delta_layer_discarded: CompactionStatisticsNumSize,
+    /// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
+    image_layer_discarded: CompactionStatisticsNumSize,
    num_unique_keys_visited: usize,
+    /// Delta visited (uncompressed, original size)
    wal_keys_visited: CompactionStatisticsNumSize,
+    /// Image visited (uncompressed, original size)
    image_keys_visited: CompactionStatisticsNumSize,
+    /// Delta produced (uncompressed, original size)
    wal_produced: CompactionStatisticsNumSize,
+    /// Image produced (uncompressed, original size)
    image_produced: CompactionStatisticsNumSize,
+
+    // Time spent in each phase
+    time_acquire_lock_secs: f64,
+    time_analyze_secs: f64,
+    time_download_layer_secs: f64,
+    time_main_loop_secs: f64,
+    time_final_phase_secs: f64,
+    time_total_secs: f64,
+
+    // Summary
+    /// Ratio of the key-value size before/after gc-compaction.
+    uncompressed_size_ratio: f64,
+    /// Ratio of the physical size before/after gc-compaction.
+    physical_size_ratio: f64,
 }

 impl CompactionStatistics {
@@ -776,11 +825,13 @@ impl CompactionStatistics {
        self.image_produced.num += 1;
        self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
    }
-    fn discard_delta_layer(&mut self) {
-        self.num_delta_layer_discarded += 1;
+    fn discard_delta_layer(&mut self, original_size: u64) {
+        self.delta_layer_discarded.num += 1;
+        self.delta_layer_discarded.size += original_size;
    }
-    fn discard_image_layer(&mut self) {
-        self.num_image_layer_discarded += 1;
+    fn discard_image_layer(&mut self, original_size: u64) {
+        self.image_layer_discarded.num += 1;
+        self.image_layer_discarded.size += original_size;
    }
    fn produce_delta_layer(&mut self, size: u64) {
        self.delta_layer_produced.num += 1;
@@ -790,6 +841,19 @@ impl CompactionStatistics {
        self.image_layer_produced.num += 1;
        self.image_layer_produced.size += size;
    }
+    fn finalize(&mut self) {
+        let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
+        let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
+        self.uncompressed_size_ratio =
+            original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
+        let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
+        let produced_physical_size = self.image_layer_produced.size
+            + self.delta_layer_produced.size
+            + self.image_layer_discarded.size
+            + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
+        self.physical_size_ratio =
+            original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
+    }
 }

 #[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
@@ -822,9 +886,7 @@ impl Timeline {
            .flags
            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
        {
-            self.compact_with_gc(cancel, options, ctx)
-                .await
-                .map_err(CompactionError::Other)?;
+            self.compact_with_gc(cancel, options, ctx).await?;
            return Ok(CompactionOutcome::Done);
        }

@@ -976,18 +1038,12 @@ impl Timeline {

            // Suppress errors when cancelled.
            Err(_) if self.cancel.is_cancelled() => {}
-            Err(CompactionError::ShuttingDown) => {}
-            Err(CompactionError::CollectKeySpaceError(CollectKeySpaceError::Cancelled)) => {}
+            Err(err) if err.is_cancel() => {}

            // Alert on critical errors that indicate data corruption.
-            Err(
-                err @ CompactionError::CollectKeySpaceError(
-                    CollectKeySpaceError::Decode(_)
-                    | CollectKeySpaceError::PageRead(
-                        PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
-                    ),
-                ),
-            ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"),
+            Err(err) if err.is_critical() => {
+                critical!("could not compact, repartitioning keyspace failed: {err:?}");
+            }

            // Log other errors. No partitioning? This is normal, if the timeline was just created
            // as an empty timeline. Also in unit tests, when we use the timeline as a simple
@@ -1161,7 +1217,7 @@ impl Timeline {
            // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
            //    - GC, which at worst witnesses us "undelete" a layer that they just deleted.
            //    - ingestion, which only inserts layers, therefore cannot collide with us.
-            let resident = layer.download_and_keep_resident().await?;
+            let resident = layer.download_and_keep_resident(ctx).await?;

            let keys_written = resident
                .filter(&self.shard_identity, &mut image_layer_writer, ctx)
@@ -1389,14 +1445,14 @@ impl Timeline {

        let mut fully_compacted = true;

-        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
+        deltas_to_compact.push(first_level0_delta.download_and_keep_resident(ctx).await?);
        for l in level0_deltas_iter {
            let lsn_range = &l.layer_desc().lsn_range;

            if lsn_range.start != prev_lsn_end {
                break;
            }
-            deltas_to_compact.push(l.download_and_keep_resident().await?);
+            deltas_to_compact.push(l.download_and_keep_resident(ctx).await?);
            deltas_to_compact_bytes += l.metadata().file_size;
            prev_lsn_end = lsn_range.end;

@@ -2350,12 +2406,19 @@ impl Timeline {
    async fn check_compaction_space(
        self: &Arc<Self>,
        layer_selection: &[Layer],
-    ) -> anyhow::Result<()> {
-        let available_space = self.check_available_space().await?;
+    ) -> Result<(), CompactionError> {
+        let available_space = self
+            .check_available_space()
+            .await
+            .map_err(CompactionError::Other)?;
        let mut remote_layer_size = 0;
        let mut all_layer_size = 0;
        for layer in layer_selection {
-            let needs_download = layer.needs_download().await?;
+            let needs_download = layer
+                .needs_download()
+                .await
+                .context("failed to check if layer needs download")
+                .map_err(CompactionError::Other)?;
            if needs_download.is_some() {
                remote_layer_size += layer.layer_desc().file_size;
            }
@@ -2364,14 +2427,14 @@ impl Timeline {
        let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */
        if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space
        {
-            return Err(anyhow!(
+            return Err(CompactionError::Other(anyhow!(
                "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
                available_space,
                allocated_space,
                all_layer_size,
                remote_layer_size,
                all_layer_size + remote_layer_size
-            ));
+            )));
        }
        Ok(())
    }
@@ -2402,7 +2465,7 @@ impl Timeline {
        self: &Arc<Self>,
        job: GcCompactJob,
        sub_compaction_max_job_size_mb: Option<u64>,
-    ) -> anyhow::Result<Vec<GcCompactJob>> {
+    ) -> Result<Vec<GcCompactJob>, CompactionError> {
        let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
            job.compact_lsn_range.end
        } else {
@@ -2553,7 +2616,7 @@ impl Timeline {
        cancel: &CancellationToken,
        options: CompactOptions,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        let sub_compaction = options.sub_compaction;
        let job = GcCompactJob::from_compact_options(options.clone());
        if sub_compaction {
@@ -2575,7 +2638,7 @@ impl Timeline {
            if jobs_len == 0 {
                info!("no jobs to run, skipping gc bottom-most compaction");
            }
-            return Ok(());
+            return Ok(CompactionOutcome::Done);
        }
        self.compact_with_gc_inner(cancel, job, ctx).await
    }
@@ -2585,19 +2648,24 @@ impl Timeline {
        cancel: &CancellationToken,
        job: GcCompactJob,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        // Block other compaction/GC tasks from running for now. GC-compaction could run along
        // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
        // Note that we already acquired the compaction lock when the outer `compact` function gets called.

+        let timer = Instant::now();
+        let begin_timer = timer;
+
        let gc_lock = async {
            tokio::select! {
                guard = self.gc_lock.lock() => Ok(guard),
-                // TODO: refactor to CompactionError to correctly pass cancelled error
-                _ = cancel.cancelled() => Err(anyhow!("cancelled")),
+                _ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
            }
        };

+        let time_acquire_lock = timer.elapsed();
+        let timer = Instant::now();
+
        let gc_lock = crate::timed(
            gc_lock,
            "acquires gc lock",
@@ -2649,7 +2717,7 @@ impl Timeline {
                        tracing::warn!(
                            "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"
                        );
-                        return Ok(());
+                        return Ok(CompactionOutcome::Skipped);
                    }
                    real_gc_cutoff
                } else {
@@ -2687,7 +2755,7 @@ impl Timeline {
                    "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}",
                    gc_cutoff
                );
-                return Ok(());
+                return Ok(CompactionOutcome::Done);
            };
            // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
            // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
@@ -2708,7 +2776,7 @@ impl Timeline {
                    "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}",
                    compact_lsn_range.end
                );
-                return Ok(());
+                return Ok(CompactionOutcome::Done);
            };
            // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
            // layers to compact.
@@ -2734,7 +2802,7 @@ impl Timeline {
                    "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}",
                    gc_cutoff, compact_key_range.start, compact_key_range.end
                );
-                return Ok(());
+                return Ok(CompactionOutcome::Done);
            }
            retain_lsns_below_horizon.sort();
            GcCompactionJobDescription {
@@ -2787,6 +2855,9 @@ impl Timeline {
            has_data_below,
        );

+        let time_analyze = timer.elapsed();
+        let timer = Instant::now();
+
        for layer in &job_desc.selected_layers {
            debug!("read layer: {}", layer.layer_desc().key());
        }
@@ -2815,10 +2886,10 @@ impl Timeline {
            .map(|layer| layer.layer_desc().layer_name())
            .collect_vec();
        if let Some(err) = check_valid_layermap(&layer_names) {
-            bail!(
+            return Err(CompactionError::Other(anyhow!(
                "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss",
                err
-            );
+            )));
        }
        // The maximum LSN we are processing in this compaction loop
        let end_lsn = job_desc
@@ -2833,11 +2904,33 @@ impl Timeline {
        let mut total_downloaded_size = 0;
        let mut total_layer_size = 0;
        for layer in &job_desc.selected_layers {
-            if layer.needs_download().await?.is_some() {
+            if layer
+                .needs_download()
+                .await
+                .context("failed to check if layer needs download")
+                .map_err(CompactionError::Other)?
+                .is_some()
+            {
                total_downloaded_size += layer.layer_desc().file_size;
            }
            total_layer_size += layer.layer_desc().file_size;
-            let resident_layer = layer.download_and_keep_resident().await?;
+            if cancel.is_cancelled() {
+                return Err(CompactionError::ShuttingDown);
+            }
+            let should_yield = self
+                .l0_compaction_trigger
+                .notified()
+                .now_or_never()
+                .is_some();
+            if should_yield {
+                tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
+                return Ok(CompactionOutcome::YieldForL0);
+            }
+            let resident_layer = layer
+                .download_and_keep_resident(ctx)
+                .await
+                .context("failed to download and keep resident layer")
+                .map_err(CompactionError::Other)?;
            downloaded_layers.push(resident_layer);
        }
        info!(
@@ -2848,19 +2941,36 @@ impl Timeline {
        );
        for resident_layer in &downloaded_layers {
            if resident_layer.layer_desc().is_delta() {
-                let layer = resident_layer.get_as_delta(ctx).await?;
+                let layer = resident_layer
+                    .get_as_delta(ctx)
+                    .await
+                    .context("failed to get delta layer")
+                    .map_err(CompactionError::Other)?;
                delta_layers.push(layer);
            } else {
-                let layer = resident_layer.get_as_image(ctx).await?;
+                let layer = resident_layer
+                    .get_as_image(ctx)
+                    .await
+                    .context("failed to get image layer")
+                    .map_err(CompactionError::Other)?;
                image_layers.push(layer);
            }
        }
-        let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
+        let (dense_ks, sparse_ks) = self
+            .collect_gc_compaction_keyspace()
+            .await
+            .context("failed to collect gc compaction keyspace")
+            .map_err(CompactionError::Other)?;
        let mut merge_iter = FilterIterator::create(
            MergeIterator::create(&delta_layers, &image_layers, ctx),
            dense_ks,
            sparse_ks,
-        )?;
+        )
+        .context("failed to create filter iterator")
+        .map_err(CompactionError::Other)?;
+
+        let time_download_layer = timer.elapsed();
+        let timer = Instant::now();

        // Step 2: Produce images+deltas.
        let mut accumulated_values = Vec::new();
@@ -2880,7 +2990,9 @@ impl Timeline {
                    &self.gate,
                    ctx,
                )
-                .await?,
+                .await
+                .context("failed to create image layer writer")
+                .map_err(CompactionError::Other)?,
            )
        } else {
            None
@@ -2893,7 +3005,9 @@ impl Timeline {
            lowest_retain_lsn..end_lsn,
            self.get_compaction_target_size(),
        )
-        .await?;
+        .await
+        .context("failed to create delta layer writer")
+        .map_err(CompactionError::Other)?;

        #[derive(Default)]
        struct RewritingLayers {
@@ -2933,9 +3047,28 @@ impl Timeline {
        // the key and LSN range are determined. However, to keep things simple here, we still
        // create this writer, and discard the writer in the end.

-        while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
+        let mut keys_processed = 0;
+
+        while let Some(((key, lsn, val), desc)) = merge_iter
+            .next_with_trace()
+            .await
+            .context("failed to get next key-value pair")
+            .map_err(CompactionError::Other)?
+        {
            if cancel.is_cancelled() {
-                return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
+                return Err(CompactionError::ShuttingDown);
+            }
+            keys_processed += 1;
+            if keys_processed % 1000 == 0 {
+                let should_yield = self
+                    .l0_compaction_trigger
+                    .notified()
+                    .now_or_never()
+                    .is_some();
+                if should_yield {
+                    tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
+                    return Ok(CompactionOutcome::YieldForL0);
+                }
            }
            if self.shard_identity.is_key_disposable(&key) {
                // If this shard does not need to store this key, simply skip it.
@@ -2967,7 +3100,9 @@ impl Timeline {
                                &self.gate,
                                ctx,
                            )
-                            .await?,
+                            .await
+                            .context("failed to create delta layer writer")
+                            .map_err(CompactionError::Other)?,
                        );
                    }
                    rewriter.before.as_mut().unwrap()
@@ -2983,14 +3118,20 @@ impl Timeline {
                                &self.gate,
                                ctx,
                            )
-                            .await?,
+                            .await
+                            .context("failed to create delta layer writer")
+                            .map_err(CompactionError::Other)?,
                        );
                    }
                    rewriter.after.as_mut().unwrap()
                } else {
                    unreachable!()
                };
-                rewriter.put_value(key, lsn, val, ctx).await?;
+                rewriter
+                    .put_value(key, lsn, val, ctx)
+                    .await
+                    .context("failed to put value")
+                    .map_err(CompactionError::Other)?;
                continue;
            }
            match val {
@@ -3013,9 +3154,13 @@ impl Timeline {
                        &job_desc.retain_lsns_below_horizon,
                        COMPACTION_DELTA_THRESHOLD,
                        get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
-                            .await?,
+                            .await
+                            .context("failed to get ancestor image")
+                            .map_err(CompactionError::Other)?,
                    )
-                    .await?;
+                    .await
+                    .context("failed to generate key retention")
+                    .map_err(CompactionError::Other)?;
                retention
                    .pipe_to(
                        *last_key,
@@ -3025,7 +3170,9 @@ impl Timeline {
                        &self.gate,
                        ctx,
                    )
-                    .await?;
+                    .await
+                    .context("failed to pipe to delta layer writer")
+                    .map_err(CompactionError::Other)?;
                accumulated_values.clear();
                *last_key = key;
                accumulated_values.push((key, lsn, val));
@@ -3043,9 +3190,14 @@ impl Timeline {
                job_desc.gc_cutoff,
                &job_desc.retain_lsns_below_horizon,
                COMPACTION_DELTA_THRESHOLD,
-                get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
+                get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn)
+                    .await
+                    .context("failed to get ancestor image")
+                    .map_err(CompactionError::Other)?,
            )
-            .await?;
+            .await
+            .context("failed to generate key retention")
+            .map_err(CompactionError::Other)?;
        retention
            .pipe_to(
                last_key,
@@ -3055,21 +3207,36 @@ impl Timeline {
                &self.gate,
                ctx,
            )
-            .await?;
+            .await
+            .context("failed to pipe to delta layer writer")
+            .map_err(CompactionError::Other)?;
        // end: move the above part to the loop body

+        let time_main_loop = timer.elapsed();
+        let timer = Instant::now();
+
        let mut rewrote_delta_layers = Vec::new();
        for (key, writers) in delta_layer_rewriters {
            if let Some(delta_writer_before) = writers.before {
                let (desc, path) = delta_writer_before
                    .finish(job_desc.compaction_key_range.start, ctx)
-                    .await?;
-                let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+                    .await
+                    .context("failed to finish delta layer writer")
+                    .map_err(CompactionError::Other)?;
+                let layer = Layer::finish_creating(self.conf, self, desc, &path)
+                    .context("failed to finish creating delta layer")
+                    .map_err(CompactionError::Other)?;
                rewrote_delta_layers.push(layer);
            }
            if let Some(delta_writer_after) = writers.after {
-                let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
-                let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+                let (desc, path) = delta_writer_after
+                    .finish(key.key_range.end, ctx)
+                    .await
+                    .context("failed to finish delta layer writer")
+                    .map_err(CompactionError::Other)?;
+                let layer = Layer::finish_creating(self.conf, self, desc, &path)
+                    .context("failed to finish creating delta layer")
+                    .map_err(CompactionError::Other)?;
                rewrote_delta_layers.push(layer);
            }
        }
@@ -3084,7 +3251,9 @@ impl Timeline {
                let end_key = job_desc.compaction_key_range.end;
                writer
                    .finish_with_discard_fn(self, ctx, end_key, discard)
-                    .await?
+                    .await
+                    .context("failed to finish image layer writer")
+                    .map_err(CompactionError::Other)?
            } else {
                drop(writer);
                Vec::new()
@@ -3096,7 +3265,9 @@ impl Timeline {
        let produced_delta_layers = if !dry_run {
            delta_layer_writer
                .finish_with_discard_fn(self, ctx, discard)
-                .await?
+                .await
+                .context("failed to finish delta layer writer")
+                .map_err(CompactionError::Other)?
        } else {
            drop(delta_layer_writer);
            Vec::new()
@@ -3108,6 +3279,13 @@ impl Timeline {
        let mut keep_layers = HashSet::new();
        let produced_delta_layers_len = produced_delta_layers.len();
        let produced_image_layers_len = produced_image_layers.len();
+
+        let layer_selection_by_key = job_desc
+            .selected_layers
+            .iter()
+            .map(|l| (l.layer_desc().key(), l.layer_desc().clone()))
+            .collect::<HashMap<_, _>>();
+
        for action in produced_delta_layers {
            match action {
                BatchWriterResult::Produced(layer) => {
@@ -3121,8 +3299,16 @@ impl Timeline {
                    if cfg!(debug_assertions) {
                        info!("discarded delta layer: {}", l);
                    }
+                    if let Some(layer_desc) = layer_selection_by_key.get(&l) {
+                        stat.discard_delta_layer(layer_desc.file_size());
+                    } else {
+                        tracing::warn!(
+                            "discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
+                            l
+                        );
+                        stat.discard_delta_layer(0);
+                    }
                    keep_layers.insert(l);
-                    stat.discard_delta_layer();
                }
            }
        }
@@ -3131,6 +3317,9 @@ impl Timeline {
                "produced rewritten delta layer: {}",
                layer.layer_desc().key()
            );
+            // For now, we include rewritten delta layer size in the "produce_delta_layer". We could
+            // make it a separate statistic in the future.
+            stat.produce_delta_layer(layer.layer_desc().file_size());
        }
        compact_to.extend(rewrote_delta_layers);
        for action in produced_image_layers {
@@ -3142,8 +3331,16 @@ impl Timeline {
                }
                BatchWriterResult::Discarded(l) => {
                    debug!("discarded image layer: {}", l);
+                    if let Some(layer_desc) = layer_selection_by_key.get(&l) {
+                        stat.discard_image_layer(layer_desc.file_size());
+                    } else {
+                        tracing::warn!(
+                            "discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
+                            l
+                        );
+                        stat.discard_image_layer(0);
+                    }
                    keep_layers.insert(l);
-                    stat.discard_image_layer();
                }
            }
        }
@@ -3176,7 +3373,9 @@ impl Timeline {
                    &layer.layer_desc().key_range,
                    &job_desc.compaction_key_range,
                ) {
-                    bail!("violated constraint: image layer outside of compaction key range");
+                    return Err(CompactionError::Other(anyhow!(
+                        "violated constraint: image layer outside of compaction key range"
+                    )));
                }
                if !fully_contains(
                    &job_desc.compaction_key_range,
@@ -3189,13 +3388,25 @@ impl Timeline {

        layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));

+        let time_final_phase = timer.elapsed();
+
+        stat.time_final_phase_secs = time_final_phase.as_secs_f64();
+        stat.time_main_loop_secs = time_main_loop.as_secs_f64();
+        stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
+        stat.time_download_layer_secs = time_download_layer.as_secs_f64();
+        stat.time_analyze_secs = time_analyze.as_secs_f64();
+        stat.time_total_secs = begin_timer.elapsed().as_secs_f64();
+        stat.finalize();
+
        info!(
            "gc-compaction statistics: {}",
-            serde_json::to_string(&stat)?
+            serde_json::to_string(&stat)
+                .context("failed to serialize gc-compaction statistics")
+                .map_err(CompactionError::Other)?
        );

        if dry_run {
-            return Ok(());
+            return Ok(CompactionOutcome::Done);
        }

        info!(
@@ -3230,10 +3441,10 @@ impl Timeline {
        // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are
        // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails.
        if let Some(err) = check_valid_layermap(&final_layers) {
-            bail!(
+            return Err(CompactionError::Other(anyhow!(
                "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss",
                err
-            );
+            )));
        }

        // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
@@ -3285,7 +3496,9 @@ impl Timeline {
        // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should
        // be batched into `schedule_compaction_update`.
        let disk_consistent_lsn = self.disk_consistent_lsn.load();
-        self.schedule_uploads(disk_consistent_lsn, None)?;
+        self.schedule_uploads(disk_consistent_lsn, None)
+            .context("failed to schedule uploads")
+            .map_err(CompactionError::Other)?;
        // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead
        // of `compact_from`.
        let compact_from = {
@@ -3312,7 +3525,7 @@ impl Timeline {

        drop(gc_lock);

-        Ok(())
+        Ok(CompactionOutcome::Done)
    }
 }

@@ -3418,6 +3631,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
    async fn downcast_delta_layer(
        &self,
        layer: &OwnArc<PersistentLayerDesc>,
+        ctx: &RequestContext,
    ) -> anyhow::Result<Option<ResidentDeltaLayer>> {
        // this is a lot more complex than a simple downcast...
        if layer.is_delta() {
@@ -3425,7 +3639,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
                let guard = self.timeline.layers.read().await;
                guard.get_from_desc(layer)
            };
-            let result = l.download_and_keep_resident().await?;
+            let result = l.download_and_keep_resident(ctx).await?;

            Ok(Some(ResidentDeltaLayer(result)))
        } else {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -11,6 +11,7 @@ use utils::id::TimelineId;
 use utils::{crashsafe, fs_ext, pausable_failpoint};

 use crate::config::PageServerConf;
+use crate::context::RequestContext;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::{
@@ -291,10 +292,11 @@ impl DeleteTimelineFlow {
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
        remote_client: RemoteTimelineClient,
+        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
        // RemoteTimelineClient is the only functioning part.
-        let timeline = tenant
+        let (timeline, _timeline_ctx) = tenant
            .create_timeline_struct(
                timeline_id,
                local_metadata,
@@ -306,6 +308,8 @@ impl DeleteTimelineFlow {
                CreateTimelineCause::Delete,
                crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
                None, // doesn't matter what we put here
+                None, // doesn't matter what we put here
+                ctx,
            )
            .context("create_timeline_struct")?;

--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -12,6 +12,7 @@ use utils::completion;
 use utils::generation::Generation;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
+use utils::sync::gate::GateError;

 use super::layer_manager::LayerManager;
 use super::{FlushLayerError, Timeline};
@@ -363,14 +364,25 @@ pub(super) async fn prepare(

    let mut tasks = tokio::task::JoinSet::new();
    let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get()));
+    let cancel_eval = CancellationToken::new();

    for adopted in rest_of_historic {
        let limiter = limiter.clone();
        let timeline = detached.clone();
+        let cancel_eval = cancel_eval.clone();

        tasks.spawn(
            async move {
-                let _permit = limiter.acquire().await;
+                let _permit = tokio::select! {
+                    permit = limiter.acquire() => {
+                        permit
+                    }
+                    // Wait for the cancellation here instead of letting the entire task be cancelled.
+                    // Cancellations are racy in that they might leave layers on disk.
+                    _ = cancel_eval.cancelled() => {
+                        Err(Error::ShuttingDown)?
+                    }
+                };
                let (owned, did_hardlink) = remote_copy(
                    &adopted,
                    &timeline,
@@ -386,7 +398,22 @@ pub(super) async fn prepare(
        );
    }

+    fn delete_layers(timeline: &Timeline, layers: Vec<Layer>) -> Result<(), Error> {
+        // Deleting layers requires holding the gate; a closed gate is reported as shutdown.
+        let _gate = match timeline.gate.enter() {
+            Ok(guard) => guard,
+            Err(GateError::GateClosed) => return Err(Error::ShuttingDown),
+        };
+        // Flag each layer with `delete_on_drop`, then drop our handle to it right away.
+        for layer in layers {
+            layer.delete_on_drop();
+            std::mem::drop(layer);
+        }
+        Ok(())
+    }
+
    let mut should_fsync = false;
+    let mut first_err = None;
    while let Some(res) = tasks.join_next().await {
        match res {
            Ok(Ok((owned, did_hardlink))) => {
@@ -395,13 +422,24 @@ pub(super) async fn prepare(
                }
                new_layers.push(owned);
            }
+
+            // Don't stop the evaluation on errors, so that we get the full set of hardlinked layers to delete.
            Ok(Err(failed)) => {
-                return Err(failed);
+                cancel_eval.cancel();
+                first_err.get_or_insert(failed);
+            }
+            Err(je) => {
+                cancel_eval.cancel();
+                first_err.get_or_insert(Error::Prepare(je.into()));
            }
-            Err(je) => return Err(Error::Prepare(je.into())),
        }
    }

+    if let Some(failed) = first_err {
+        delete_layers(detached, new_layers)?;
+        return Err(failed);
+    }
+
    // fsync directory again if we hardlinked something
    if should_fsync {
        fsync_timeline_dir(detached, ctx).await;
@@ -592,7 +630,7 @@ async fn copy_lsn_prefix(
    .with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}"))
    .map_err(Error::Prepare)?;

-    let resident = layer.download_and_keep_resident().await.map_err(|e| {
+    let resident = layer.download_and_keep_resident(ctx).await.map_err(|e| {
        if e.is_cancelled() {
            Error::ShuttingDown
        } else {
@@ -650,6 +688,11 @@ async fn remote_copy(
    let conf = adoptee.conf;
    let file_name = adopted.layer_desc().layer_name();

+    // We don't want to shut the timeline down during this operation because we do `delete_on_drop` below
+    let _gate = adoptee.gate.enter().map_err(|e| match e {
+        GateError::GateClosed => Error::ShuttingDown,
+    })?;
+
    // depending if Layer::keep_resident, do a hardlink
    let did_hardlink;
    let owned = if let Some(adopted_resident) = adopted.keep_resident().await {
@@ -661,8 +704,32 @@ async fn remote_copy(
            &file_name,
            &metadata.generation,
        );
-        std::fs::hard_link(adopted_path, &adoptee_path)
-            .map_err(|e| Error::launder(e.into(), Error::Prepare))?;
+
+        match std::fs::hard_link(adopted_path, &adoptee_path) {
+            Ok(()) => {}
+            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
+                // In theory we should not get into this situation as we are doing cleanups of the layer file after errors.
+                // However, we don't do cleanups for errors past `prepare`, so there is the slight chance to get to this branch.
+
+                // Double check that the file is an orphan (probably from an earlier attempt), then delete it
+                let key = file_name.clone().into();
+                if adoptee.layers.read().await.contains_key(&key) {
+                    // We are supposed to filter out such cases before coming to this function
+                    return Err(Error::Prepare(anyhow::anyhow!(
+                        "layer file {file_name} already present and inside layer map"
+                    )));
+                }
+                tracing::info!("Deleting orphan layer file to make way for hard linking");
+                // Remove the orphan at the link destination (`adoptee_path`), never the source `adopted_path`, then retry
+                std::fs::remove_file(&adoptee_path)
+                    .map_err(|e| Error::launder(e.into(), Error::Prepare))?;
+                std::fs::hard_link(adopted_path, &adoptee_path)
+                    .map_err(|e| Error::launder(e.into(), Error::Prepare))?;
+            }
+            Err(e) => {
+                return Err(Error::launder(e.into(), Error::Prepare));
+            }
+        };
        did_hardlink = true;
        Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard()
    } else {
@@ -670,12 +737,21 @@ async fn remote_copy(
        Layer::for_evicted(conf, adoptee, file_name, metadata)
    };

-    let layer = adoptee
+    let layer = match adoptee
        .remote_client
        .copy_timeline_layer(adopted, &owned, cancel)
        .await
-        .map(move |()| owned)
-        .map_err(|e| Error::launder(e, Error::Prepare))?;
+    {
+        Ok(()) => owned,
+        Err(e) => {
+            {
+                // Clean up the layer so that on a retry we don't get errors that the file already exists
+                owned.delete_on_drop();
+                std::mem::drop(owned);
+            }
+            return Err(Error::launder(e, Error::Prepare));
+        }
+    };

    Ok((layer, did_hardlink))
 }
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -93,7 +93,8 @@ impl Timeline {
            }
        }

-        let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
+        let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn)
+            .with_scope_timeline(&self);
        loop {
            let policy = self.get_eviction_policy();
            let cf = self
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1,5 +1,4 @@
-//! An efficient way to keep the timeline gate open without preventing
-//! timeline shutdown for longer than a single call to a timeline method.
+//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`.
 //!
 //! # Motivation
 //!
@@ -19,27 +18,32 @@
 //! we hold the Timeline gate open while we're invoking the method on the
 //! Timeline object.
 //!
-//! However, we want to avoid the overhead of entering the gate for every
-//! method invocation.
-//!
-//! Further, for shard routing, we want to avoid calling the tenant manager to
-//! resolve the shard for every request. Instead, we want to cache the
-//! routing result so we can bypass the tenant manager for all subsequent requests
-//! that get routed to that shard.
+//! We want to avoid the overhead of doing, for each incoming request,
+//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
+//! - cloning the `Arc<Timeline>` out of the tenant manager so we can
+//!   release the mgr rwlock before doing any request processing work
+//! - re-entering the Timeline gate for each Timeline method invocation.
 //!
 //! Regardless of how we accomplish the above, it should not
 //! prevent the Timeline from shutting down promptly.
 //!
+//!
 //! # Design
 //!
 //! ## Data Structures
 //!
-//! There are three user-facing data structures:
+//! There are two concepts expressed as associated types in the `Types` trait:
+//! - `TenantManager`: the thing that performs the expensive work. It produces
+//!   a `Timeline` object, which is the other associated type.
+//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup.
+//!
+//! There are four user-facing data structures exposed by this module:
 //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
 //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
-//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
+//! - `Handle`: a smart pointer that derefs to the Types::Timeline.
 //! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
-//!   trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
+//!   trying to upgrade back to a `Handle`. If successful, a re-upgraded Handle will always
+//!   point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`.
 //!
 //! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
 //! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
@@ -64,11 +68,14 @@
 //!
 //! To dispatch a request, the page service connection calls `Cache::get`.
 //!
-//! A cache miss means we consult the tenant manager for shard routing,
-//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the the
-//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
+//! A cache miss means we call Types::TenantManager::resolve for shard routing,
+//! cloning the `Arc<Timeline>` out of it, and entering the gate. The result of
+//! resolve() is the object we want to cache, and to which we return `Handle`s for subsequent `Cache::get` calls.
+//!
+//! We wrap the object returned from resolve() in an `Arc` and store that inside the
+//! `Arc<Mutex<HandleInner>>>`. A weak ref to the HandleInner is stored in the `Cache`
 //! and a strong ref in the `PerTimelineState`.
-//! A strong ref is returned wrapped in a `Handle`.
+//! Another strong ref is returned wrapped in a `Handle`.
 //!
 //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
 //! and find the weak ref in the cache.
@@ -78,51 +85,51 @@
 //! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
 //! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
 //! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
-//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
+//! It then drops the `Handle`, and thus the `Arc<Mutex<HandleInner>>` inside it.
 //!
 //! # Performance
 //!
 //! Remember from the introductory section:
 //!
-//! > However, we want to avoid the overhead of entering the gate for every
-//! > method invocation.
+//! > We want to avoid the overhead of doing, for each incoming request,
+//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
+//! > - cloning the `Arc<Timeline>` out of the tenant manager so we can
+//! >   release the mgr rwlock before doing any request processing work
+//! > - re-entering the Timeline gate for each Timeline method invocation.
 //!
-//! Why do we want to avoid that?
-//! Because the gate is a shared location in memory and entering it involves
-//! bumping refcounts, which leads to cache contention if done frequently
-//! from multiple cores in parallel.
+//! All of these boil down to some state that is either globally shared among all shards
+//! or state shared among all tasks that serve a particular timeline.
+//! It is either protected by RwLock or manipulated via atomics.
+//! Even atomics are costly when shared across multiple cores.
+//! So, we want to avoid any permanent need for coordination between page_service tasks.
 //!
-//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
-//! That `Arc` is private to the `HandleInner` and hence to the connection.
+//! The solution is to add indirection: we wrap the Types::Timeline object that is
+//! returned by Types::TenantManager into an Arc that is private to the `HandleInner`
+//! and hence to the single Cache / page_service connection.
 //! (Review the "Data Structures" section if that is unclear to you.)
 //!
-//! A `WeakHandle` is a weak ref to the `HandleInner`.
-//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
-//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
-//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection.
 //!
-//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
-//! Again, this is cheap because the `Arc` is private to the connection.
+//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex<HandleInner>`),
+//! lock the mutex, take out a clone of the `Arc<Types::Timeline>`, and drop the Mutex.
+//! The Mutex is not contended because it is private to the connection.
+//! And again, the `Arc<Types::Timeline>` clone is cheap because that wrapper
+//! Arc's refcounts are private to the connection.
+//!
+//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection.
 //!
-//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
-//! For this, both `Handle` need infallible access to an `Arc<Timeline>`.
-//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
-//! on the shared memory location that trakcs the refcount of the `Arc<Timeline>`.
-//! Instead, we wrap the `Arc<Timeline>` into another `Arc`.
-//! so that we can clone it cheaply when upgrading a `WeakHandle`.
 //!
 //! # Shutdown
 //!
 //! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
 //!
 //! ```text
-//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
+//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline
 //! ```
 //!
 //! Further, there is this cycle:
 //!
 //! ```text
-//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
+//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline
 //! ```
 //!
 //! The former cycle is a memory leak if not broken.
@@ -135,9 +142,12 @@
 //! - Timeline shutdown (=> `PerTimelineState::shutdown`)
 //! - Connection shutdown (=> dropping the `Cache`).
 //!
-//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
-//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
-//! `Arc<GateGuard>`.
+//! Both transition the `HandleInner` from [`HandleInner::Open`] to
+//! [`HandleInner::ShutDown`], which drops the only long-lived
+//! `Arc<Types::Timeline>`. Once the last short-lived Arc<Types::Timeline>
+//! is dropped, the `Types::Timeline` gets dropped and thereby
+//! the `GateGuard` and the `Arc<Timeline>` that it stores,
+//! thereby breaking both cycles.
 //!
 //! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
 //! thereby breaking the cycle.
@@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector;
 pub(crate) trait Types: Sized + std::fmt::Debug {
    type TenantManagerError: Sized + std::fmt::Debug;
    type TenantManager: TenantManager<Self> + Sized;
-    type Timeline: ArcTimeline<Self> + Sized;
+    type Timeline: Timeline<Self> + Sized;
 }

 /// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
@@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId {

 /// See module-level comment.
 pub(crate) struct Handle<T: Types> {
-    timeline: Arc<T::Timeline>,
-    #[allow(dead_code)] // the field exists to keep the gate open
-    gate_guard: Arc<utils::sync::gate::GateGuard>,
    inner: Arc<Mutex<HandleInner<T>>>,
+    open: Arc<T::Timeline>,
 }
 pub(crate) struct WeakHandle<T: Types> {
    inner: Weak<Mutex<HandleInner<T>>>,
 }
+
 enum HandleInner<T: Types> {
-    KeepingTimelineGateOpen {
-        #[allow(dead_code)]
-        gate_guard: Arc<utils::sync::gate::GateGuard>,
-        timeline: Arc<T::Timeline>,
-    },
+    Open(Arc<T::Timeline>),
    ShutDown,
 }

@@ -307,8 +312,7 @@ pub(crate) trait TenantManager<T: Types> {
 }

 /// Abstract view of an [`Arc<Timeline>`], for testability.
-pub(crate) trait ArcTimeline<T: Types>: Clone {
-    fn gate(&self) -> &utils::sync::gate::Gate;
+pub(crate) trait Timeline<T: Types> {
    fn shard_timeline_id(&self) -> ShardTimelineId;
    fn get_shard_identity(&self) -> &ShardIdentity;
    fn per_timeline_state(&self) -> &PerTimelineState<T>;
@@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline<T: Types>: Clone {
 #[derive(Debug)]
 pub(crate) enum GetError<T: Types> {
    TenantManager(T::TenantManagerError),
-    TimelineGateClosed,
    PerTimelineStateShutDown,
 }

@@ -434,21 +437,9 @@ impl<T: Types> Cache<T> {
                }

                trace!("creating new HandleInner");
-                let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
-                    gate_guard: Arc::new(
-                        // this enter() is expensive in production code because
-                        // it hits the global Arc<Timeline>::gate refcounts
-                        match timeline.gate().enter() {
-                            Ok(guard) => guard,
-                            Err(_) => {
-                                return Err(GetError::TimelineGateClosed);
-                            }
-                        },
-                    ),
-                    // this clone is expensive in production code because
-                    // it hits the global Arc<Timeline>::clone refcounts
-                    timeline: Arc::new(timeline.clone()),
-                }));
+                let timeline = Arc::new(timeline);
+                let handle_inner_arc =
+                    Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
                let handle_weak = WeakHandle {
                    inner: Arc::downgrade(&handle_inner_arc),
                };
@@ -503,18 +494,10 @@ impl<T: Types> WeakHandle<T> {
        };
        let lock_guard = inner.lock().expect("poisoned");
        match &*lock_guard {
-            HandleInner::KeepingTimelineGateOpen {
-                timeline,
-                gate_guard,
-            } => {
-                let gate_guard = Arc::clone(gate_guard);
-                let timeline = Arc::clone(timeline);
+            HandleInner::Open(open) => {
+                let open = Arc::clone(open);
                drop(lock_guard);
-                Ok(Handle {
-                    timeline,
-                    gate_guard,
-                    inner,
-                })
+                Ok(Handle { open, inner })
            }
            HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
        }
@@ -528,7 +511,7 @@ impl<T: Types> WeakHandle<T> {
 impl<T: Types> std::ops::Deref for Handle<T> {
    type Target = T::Timeline;
    fn deref(&self) -> &Self::Target {
-        &self.timeline
+        &self.open
    }
 }

@@ -545,7 +528,7 @@ impl<T: Types> PerTimelineState<T> {
    /// to the [`Types::Timeline`] that embeds this per-timeline state.
    /// Even if [`TenantManager::resolve`] would still resolve to it.
    ///
-    /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
+    /// Already-alive [`Handle`]s will remain open and usable, and will keep the [`Types::Timeline`] alive.
    /// That's ok because they're short-lived. See module-level comment for details.
    #[instrument(level = "trace", skip_all)]
    pub(super) fn shutdown(&self) {
@@ -611,7 +594,7 @@ impl<T: Types> Drop for Cache<T> {
 impl<T: Types> HandleInner<T> {
    fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
        match std::mem::replace(self, HandleInner::ShutDown) {
-            HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
+            HandleInner::Open(timeline) => Some(timeline),
            HandleInner::ShutDown => {
                // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
                // may do it concurrently, but locking rules disallow holding per-timeline-state lock and
@@ -631,6 +614,7 @@ mod tests {
    use pageserver_api::reltag::RelTag;
    use pageserver_api::shard::ShardStripeSize;
    use utils::shard::ShardCount;
+    use utils::sync::gate::GateGuard;

    use super::*;

@@ -641,7 +625,7 @@ mod tests {
    impl Types for TestTypes {
        type TenantManagerError = anyhow::Error;
        type TenantManager = StubManager;
-        type Timeline = Arc<StubTimeline>;
+        type Timeline = Entered;
    }

    struct StubManager {
@@ -656,17 +640,19 @@ mod tests {
        myself: Weak<StubTimeline>,
    }

+    struct Entered {
+        timeline: Arc<StubTimeline>,
+        #[allow(dead_code)] // it's stored here to keep the gate open
+        gate_guard: Arc<GateGuard>,
+    }
+
    impl StubTimeline {
        fn getpage(&self) {
            // do nothing
        }
    }

-    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
-        fn gate(&self) -> &utils::sync::gate::Gate {
-            &self.gate
-        }
-
+    impl Timeline<TestTypes> for Entered {
        fn shard_timeline_id(&self) -> ShardTimelineId {
            ShardTimelineId {
                shard_index: self.shard.shard_index(),
@@ -688,20 +674,34 @@ mod tests {
            &self,
            timeline_id: TimelineId,
            shard_selector: ShardSelector,
-        ) -> anyhow::Result<Arc<StubTimeline>> {
+        ) -> anyhow::Result<Entered> {
            for timeline in &self.shards {
                if timeline.id == timeline_id {
+                    let enter_gate = || {
+                        let gate_guard = timeline.gate.enter()?;
+                        let gate_guard = Arc::new(gate_guard);
+                        anyhow::Ok(gate_guard)
+                    };
                    match &shard_selector {
                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                        }
                        ShardSelector::Zero => continue,
                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                        }
                        ShardSelector::Page(_) => continue,
                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                        }
                        ShardSelector::Known(_) => continue,
                    }
@@ -711,6 +711,13 @@ mod tests {
        }
    }

+    impl std::ops::Deref for Entered {
+        type Target = StubTimeline;
+        fn deref(&self) -> &Self::Target {
+            &self.timeline
+        }
+    }
+
    #[tokio::test(start_paused = true)]
    async fn test_timeline_shutdown() {
        crate::tenant::harness::setup_logging();
@@ -1038,7 +1045,6 @@ mod tests {
        let key = DBDIR_KEY;

        // Simulate 10 connections that's opened, used, and closed
-        let mut used_handles = vec![];
        for _ in 0..10 {
            let mut cache = Cache::<TestTypes>::default();
            let handle = {
@@ -1050,7 +1056,6 @@ mod tests {
                handle
            };
            handle.getpage();
-            used_handles.push(Arc::downgrade(&handle.timeline));
        }

        // No handles exist, thus gates are closed and don't require shutdown.
--- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
+++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
@@ -10,6 +10,8 @@ use http_utils::error::ApiError;
 use tokio_util::sync::CancellationToken;
 use utils::sync::gate::Gate;

+use crate::context::RequestContext;
+
 use super::Timeline;

 // This status is not strictly necessary now, but gives us a nice place
@@ -30,6 +32,8 @@ impl HeatmapLayersDownloader {
    fn new(
        timeline: Arc<Timeline>,
        concurrency: usize,
+        recurse: bool,
+        ctx: RequestContext,
    ) -> Result<HeatmapLayersDownloader, ApiError> {
        let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?;

@@ -57,12 +61,13 @@ impl HeatmapLayersDownloader {

                tracing::info!(
                    resident_size=%timeline.resident_physical_size(),
-                    heatmap_layers=%heatmap.layers.len(),
+                    heatmap_layers=%heatmap.all_layers().count(),
                    "Starting heatmap layers download"
                );

-                let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map(
+                let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
                    |layer| {
+                        let ctx = ctx.attached_child();
                        let tl = timeline.clone();
                        let dl_guard = match downloads_guard.enter() {
                            Ok(g) => g,
@@ -75,7 +80,7 @@ impl HeatmapLayersDownloader {
                        Some(async move {
                            let _dl_guard = dl_guard;

-                            let res = tl.download_layer(&layer.name).await;
+                            let res = tl.download_layer(&layer.name, &ctx).await;
                            if let Err(err) = res {
                                if !err.is_cancelled() {
                                    tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}")
@@ -94,6 +99,20 @@ impl HeatmapLayersDownloader {
                    },
                    _ = cancel.cancelled() => {
                        tracing::info!("Heatmap layers download cancelled");
+                        return;
+                    }
+                }
+
+                if recurse {
+                    if let Some(ancestor) = timeline.ancestor_timeline() {
+                        let ctx = ctx.attached_child();
+                        let res =
+                            ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx);
+                        if let Err(err) = res {
+                            tracing::info!(
+                                "Failed to start heatmap layers download for ancestor: {err}"
+                            );
+                        }
                    }
                }
            }
@@ -136,13 +155,20 @@ impl HeatmapLayersDownloader {
 }

 impl Timeline {
-    pub(crate) async fn start_heatmap_layers_download(
+    pub(crate) fn start_heatmap_layers_download(
        self: &Arc<Self>,
        concurrency: usize,
+        recurse: bool,
+        ctx: &RequestContext,
    ) -> Result<(), ApiError> {
        let mut locked = self.heatmap_layers_downloader.lock().unwrap();
        if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) {
-            let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?;
+            let dl = HeatmapLayersDownloader::new(
+                self.clone(),
+                concurrency,
+                recurse,
+                ctx.attached_child(),
+            )?;
            *locked = Some(dl);
            Ok(())
        } else {
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -8,14 +8,14 @@ use tracing::trace;
 use utils::id::TimelineId;
 use utils::lsn::{AtomicLsn, Lsn};

-use super::TimelineWriterState;
+use super::{ReadableLayer, TimelineWriterState};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::metrics::TimelineMetrics;
 use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
 use crate::tenant::storage_layer::{
    AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
-    PersistentLayerKey, ResidentLayer,
+    PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
 };

 /// Provides semantic APIs to manipulate the layer map.
@@ -37,6 +37,21 @@ impl Default for LayerManager {
 }

 impl LayerManager {
+    pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
+        match weak {
+            ReadableLayerWeak::PersistentLayer(desc) => {
+                ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
+            }
+            ReadableLayerWeak::InMemoryLayer(desc) => {
+                let inmem = self
+                    .layer_map()
+                    .expect("no concurrent shutdown")
+                    .in_memory_layer(&desc);
+                ReadableLayer::InMemoryLayer(inmem)
+            }
+        }
+    }
+
    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
@@ -470,6 +485,25 @@ impl OpenLayerManager {
        mapping.remove(layer);
        layer.delete_on_drop();
    }
+
+    #[cfg(test)]
+    pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc<InMemoryLayer>) {
+        use pageserver_api::models::InMemoryLayerInfo;
+
+        match layer.info() {
+            InMemoryLayerInfo::Open { .. } => {
+                assert!(self.layer_map.open_layer.is_none());
+                self.layer_map.open_layer = Some(layer);
+            }
+            InMemoryLayerInfo::Frozen { lsn_start, .. } => {
+                if let Some(last) = self.layer_map.frozen_layers.back() {
+                    assert!(last.get_lsn_range().end <= lsn_start);
+                }
+
+                self.layer_map.frozen_layers.push_back(layer);
+            }
+        }
+    }
 }

 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -961,7 +961,8 @@ mod tests {
    }

    async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let (_temp_dir, pathbuf, offsets) =
            write_maybe_compressed(blobs, compression, &ctx).await?;

--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -26,15 +26,14 @@ use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut};
 use owned_buffers_io::io_buf_ext::FullSlice;
 use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 pub use pageserver_api::models::virtual_file as api;
-use pageserver_api::shard::TenantShardId;
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

+use crate::assert_u64_eq_usize::UsizeIsU64;
 use crate::context::RequestContext;
-use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC, StorageIoOperation};
+use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation};
 use crate::page_cache::{PAGE_SZ, PageWriteGuard};
-use crate::tenant::TENANTS_SEGMENT_NAME;
 pub(crate) mod io_engine;
 pub use io_engine::{
    FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test,
@@ -121,7 +120,7 @@ impl VirtualFile {
    pub async fn open_with_options<P: AsRef<Utf8Path>>(
        path: P,
        open_options: &OpenOptions,
-        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+        ctx: &RequestContext,
    ) -> Result<Self, std::io::Error> {
        let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
        Ok(VirtualFile {
@@ -133,7 +132,7 @@ impl VirtualFile {
    pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
        path: P,
        open_options: &OpenOptions,
-        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+        ctx: &RequestContext,
    ) -> Result<Self, std::io::Error> {
        let file = match get_io_mode() {
            IoMode::Buffered => {
@@ -300,13 +299,6 @@ pub struct VirtualFileInner {
    /// storing it here.
    pub path: Utf8PathBuf,
    open_options: OpenOptions,
-
-    // These are strings becase we only use them for metrics, and those expect strings.
-    // It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
-    // strings.
-    tenant_id: String,
-    shard_id: String,
-    timeline_id: String,
 }

 #[derive(Debug, PartialEq, Clone, Copy)]
@@ -588,36 +580,16 @@ impl VirtualFileInner {
    pub async fn open_with_options<P: AsRef<Utf8Path>>(
        path: P,
        open_options: &OpenOptions,
-        _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+        _ctx: &RequestContext,
    ) -> Result<VirtualFileInner, std::io::Error> {
-        let path_ref = path.as_ref();
-        let path_str = path_ref.to_string();
-        let parts = path_str.split('/').collect::<Vec<&str>>();
-        let (tenant_id, shard_id, timeline_id) =
-            if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
-                let tenant_shard_part = parts[parts.len() - 4];
-                let (tenant_id, shard_id) = match tenant_shard_part.parse::<TenantShardId>() {
-                    Ok(tenant_shard_id) => (
-                        tenant_shard_id.tenant_id.to_string(),
-                        format!("{}", tenant_shard_id.shard_slug()),
-                    ),
-                    Err(_) => {
-                        // Malformed path: this ID is just for observability, so tolerate it
-                        // and pass through
-                        (tenant_shard_part.to_string(), "*".to_string())
-                    }
-                };
-                (tenant_id, shard_id, parts[parts.len() - 2].to_string())
-            } else {
-                ("*".to_string(), "*".to_string(), "*".to_string())
-            };
+        let path = path.as_ref();
        let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;

        // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
        // where our caller doesn't get to use the returned VirtualFile before its
        // slot gets re-used by someone else.
        let file = observe_duration!(StorageIoOperation::Open, {
-            open_options.open(path_ref.as_std_path()).await?
+            open_options.open(path.as_std_path()).await?
        });

        // Strip all options other than read and write.
@@ -633,11 +605,8 @@ impl VirtualFileInner {
        let vfile = VirtualFileInner {
            handle: RwLock::new(handle),
            pos: 0,
-            path: path_ref.to_path_buf(),
+            path: path.to_owned(),
            open_options: reopen_options,
-            tenant_id,
-            shard_id,
-            timeline_id,
        };

        // TODO: Under pressure, it's likely the slot will get re-used and
@@ -934,7 +903,7 @@ impl VirtualFileInner {
        &self,
        buf: tokio_epoll_uring::Slice<Buf>,
        offset: u64,
-        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+        ctx: &RequestContext,
    ) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
    where
        Buf: tokio_epoll_uring::IoBufMut + Send,
@@ -952,14 +921,7 @@ impl VirtualFileInner {
            let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
            let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
            if let Ok(size) = res {
-                STORAGE_IO_SIZE
-                    .with_label_values(&[
-                        "read",
-                        &self.tenant_id,
-                        &self.shard_id,
-                        &self.timeline_id,
-                    ])
-                    .add(size as i64);
+                ctx.io_size_metrics().read.add(size.into_u64());
            }
            (buf, res)
        })
@@ -970,9 +932,9 @@ impl VirtualFileInner {
        &self,
        buf: FullSlice<B>,
        offset: u64,
-        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+        ctx: &RequestContext,
    ) -> (FullSlice<B>, Result<usize, Error>) {
-        let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
+        let (slice, result) = self.write_at_inner(buf, offset, ctx).await;
        let result = result.maybe_fatal_err("write_at");
        (slice, result)
    }
@@ -981,7 +943,7 @@ impl VirtualFileInner {
        &self,
        buf: FullSlice<B>,
        offset: u64,
-        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+        ctx: &RequestContext,
    ) -> (FullSlice<B>, Result<usize, Error>) {
        let file_guard = match self.lock_file().await {
            Ok(file_guard) => file_guard,
@@ -991,14 +953,7 @@ impl VirtualFileInner {
            let ((_file_guard, buf), result) =
                io_engine::get().write_at(file_guard, offset, buf).await;
            if let Ok(size) = result {
-                STORAGE_IO_SIZE
-                    .with_label_values(&[
-                        "write",
-                        &self.tenant_id,
-                        &self.shard_id,
-                        &self.timeline_id,
-                    ])
-                    .add(size as i64);
+                ctx.io_size_metrics().write.add(size.into_u64());
            }
            (buf, result)
        })
@@ -1584,7 +1539,8 @@ mod tests {
    where
        A: Adapter,
    {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir = crate::config::PageServerConf::test_repo_dir(testname);
        std::fs::create_dir_all(&testdir)?;

@@ -1711,7 +1667,8 @@ mod tests {
        const THREADS: usize = 100;
        const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];

-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
        std::fs::create_dir_all(&testdir)?;

@@ -1770,7 +1727,8 @@ mod tests {

    #[tokio::test]
    async fn test_atomic_overwrite_basic() {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
        std::fs::create_dir_all(&testdir).unwrap();

@@ -1798,7 +1756,8 @@ mod tests {

    #[tokio::test]
    async fn test_atomic_overwrite_preexisting_tmp() {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir =
            crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
        std::fs::create_dir_all(&testdir).unwrap();
--- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
@@ -181,7 +181,8 @@ where
        Err(self
            .shutdown()
            .await
-            .expect_err("flush task only disconnects duplex if it exits with an error"))
+            .err()
+            .expect("flush task only disconnects duplex if it exits with an error"))
    }

    /// Cleans up the channel, join the flush task.
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -136,7 +136,9 @@ impl WalRedoProcess {
                        Ok(0) => break Ok(()), // eof
                        Ok(num_bytes) => {
                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
+                            if !output.contains("LOG:") {
+                               error!(%output, "received output");
+                            }
                        }
                        Err(e) => {
                            break Err(e);