Merge WITH CONFLICTS 2025-03-11 main commit '7c462b3417ecd3ae3907f3480f3b8a8c99fc6d7b' into yuchen/direct-io-delta-image-layer-write

Conflicts: pageserver/src/tenant/blob_io.rs
2026-05-30 11:30:37 +00:00 · 2025-04-09 19:39:12 +02:00
parent 537eb334f2 7c462b3417
commit f078d7e1a9
380 changed files with 13238 additions and 5308 deletions
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -48,6 +48,9 @@ pprof.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
+rustls-pemfile.workspace = true
+rustls-pki-types.workspace = true
+rustls.workspace = true
 scopeguard.workspace = true
 send-future.workspace = true
 serde.workspace = true
@@ -62,6 +65,7 @@ tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util"
 tokio-epoll-uring.workspace = true
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
+tokio-rustls.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
@@ -98,6 +102,7 @@ criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
 indoc.workspace = true
+uuid.workspace = true

 [[bench]]
 name = "bench_layer_map"
@@ -115,6 +120,10 @@ harness = false
 name = "upload_queue"
 harness = false

+[[bench]]
+name = "bench_metrics"
+harness = false
+
 [[bin]]
 name = "test_helper_slow_client_reads"
 required-features = [ "testing" ]
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -7,7 +7,6 @@ use std::time::Instant;

 use criterion::measurement::WallTime;
 use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main};
-use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc};
 use pageserver_api::key::Key;
@@ -72,41 +71,6 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
        .collect()
 }

-// Construct a partitioning for testing get_difficulty map when we
-// don't have an exact result of `collect_keyspace` to work with.
-fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
-    let mut parts = Vec::new();
-
-    // We add a partition boundary at the start of each image layer,
-    // no matter what lsn range it covers. This is just the easiest
-    // thing to do. A better thing to do would be to get a real
-    // partitioning from some database. Even better, remove the need
-    // for key partitions by deciding where to create image layers
-    // directly based on a coverage-based difficulty map.
-    let mut keys: Vec<_> = layer_map
-        .iter_historic_layers()
-        .filter_map(|l| {
-            if l.is_incremental() {
-                None
-            } else {
-                let kr = l.get_key_range();
-                Some(kr.start.next())
-            }
-        })
-        .collect();
-    keys.sort();
-
-    let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
-    for key in keys {
-        parts.push(KeySpace {
-            ranges: vec![current_key..key],
-        });
-        current_key = key;
-    }
-
-    KeyPartitioning { parts }
-}
-
 // Benchmark using metadata extracted from our performance test environment, from
 // a project where we have run pgbench many timmes. The pgbench database was initialized
 // between each test run.
@@ -148,41 +112,6 @@ fn bench_from_real_project(c: &mut Criterion) {
    // Choose uniformly distributed queries
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

-    // Choose inputs for get_difficulty_map
-    let latest_lsn = layer_map
-        .iter_historic_layers()
-        .map(|l| l.get_lsn_range().end)
-        .max()
-        .unwrap();
-    let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
-
-    // Check correctness of get_difficulty_map
-    // TODO put this in a dedicated test outside of this mod
-    {
-        println!("running correctness check");
-
-        let now = Instant::now();
-        let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
-        assert!(result_bruteforce.len() == partitioning.parts.len());
-        println!("Finished bruteforce in {:?}", now.elapsed());
-
-        let now = Instant::now();
-        let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
-        assert!(result_fast.len() == partitioning.parts.len());
-        println!("Finished fast in {:?}", now.elapsed());
-
-        // Assert results are equal. Manually iterate for easier debugging.
-        let zip = std::iter::zip(
-            &partitioning.parts,
-            std::iter::zip(result_bruteforce, result_fast),
-        );
-        for (_part, (bruteforce, fast)) in zip {
-            assert_eq!(bruteforce, fast);
-        }
-
-        println!("No issues found");
-    }
-
    // Define and name the benchmark function
    let mut group = c.benchmark_group("real_map");
    group.bench_function("uniform_queries", |b| {
@@ -192,11 +121,6 @@ fn bench_from_real_project(c: &mut Criterion) {
            }
        });
    });
-    group.bench_function("get_difficulty_map", |b| {
-        b.iter(|| {
-            layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
-        });
-    });
    group.finish();
 }

--- a/pageserver/benches/bench_metrics.rs
+++ b/pageserver/benches/bench_metrics.rs
@@ -0,0 +1,366 @@
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use utils::id::{TenantId, TimelineId};
+
+//
+// Demonstrates that repeated label-value lookup is a multicore scalability bottleneck
+// that is worth avoiding.
+//
+criterion_group!(
+    label_values,
+    label_values::bench_naive_usage,
+    label_values::bench_cache_label_values_lookup
+);
+mod label_values {
+    use super::*;
+
+    pub fn bench_naive_usage(c: &mut Criterion) {
+        let mut g = c.benchmark_group("label_values__naive_usage");
+
+        for ntimelines in [1, 4, 8] {
+            g.bench_with_input(
+                BenchmarkId::new("ntimelines", ntimelines),
+                &ntimelines,
+                |b, ntimelines| {
+                    b.iter_custom(|iters| {
+                        let barrier = std::sync::Barrier::new(*ntimelines + 1);
+
+                        let timelines = (0..*ntimelines)
+                            .map(|_| {
+                                (
+                                    TenantId::generate().to_string(),
+                                    "0000".to_string(),
+                                    TimelineId::generate().to_string(),
+                                )
+                            })
+                            .collect::<Vec<_>>();
+
+                        let metric_vec = metrics::UIntGaugeVec::new(
+                            metrics::opts!("testmetric", "testhelp"),
+                            &["tenant_id", "shard_id", "timeline_id"],
+                        )
+                        .unwrap();
+
+                        std::thread::scope(|s| {
+                            for (tenant_id, shard_id, timeline_id) in &timelines {
+                                s.spawn(|| {
+                                    barrier.wait();
+                                    for _ in 0..iters {
+                                        metric_vec
+                                            .with_label_values(&[tenant_id, shard_id, timeline_id])
+                                            .inc();
+                                    }
+                                    barrier.wait();
+                                });
+                            }
+                            barrier.wait();
+                            let start = std::time::Instant::now();
+                            barrier.wait();
+                            start.elapsed()
+                        })
+                    })
+                },
+            );
+        }
+        g.finish();
+    }
+
+    pub fn bench_cache_label_values_lookup(c: &mut Criterion) {
+        let mut g = c.benchmark_group("label_values__cache_label_values_lookup");
+
+        for ntimelines in [1, 4, 8] {
+            g.bench_with_input(
+                BenchmarkId::new("ntimelines", ntimelines),
+                &ntimelines,
+                |b, ntimelines| {
+                    b.iter_custom(|iters| {
+                        let barrier = std::sync::Barrier::new(*ntimelines + 1);
+
+                        let timelines = (0..*ntimelines)
+                            .map(|_| {
+                                (
+                                    TenantId::generate().to_string(),
+                                    "0000".to_string(),
+                                    TimelineId::generate().to_string(),
+                                )
+                            })
+                            .collect::<Vec<_>>();
+
+                        let metric_vec = metrics::UIntGaugeVec::new(
+                            metrics::opts!("testmetric", "testhelp"),
+                            &["tenant_id", "shard_id", "timeline_id"],
+                        )
+                        .unwrap();
+
+                        std::thread::scope(|s| {
+                            for (tenant_id, shard_id, timeline_id) in &timelines {
+                                s.spawn(|| {
+                                    let metric = metric_vec.with_label_values(&[
+                                        tenant_id,
+                                        shard_id,
+                                        timeline_id,
+                                    ]);
+                                    barrier.wait();
+                                    for _ in 0..iters {
+                                        metric.inc();
+                                    }
+                                    barrier.wait();
+                                });
+                            }
+                            barrier.wait();
+                            let start = std::time::Instant::now();
+                            barrier.wait();
+                            start.elapsed()
+                        })
+                    })
+                },
+            );
+        }
+        g.finish();
+    }
+}
+
+//
+// Demonstrates that even a single metric can be a scalability bottleneck
+// if multiple threads increment it concurrently, but there's nothing we can do
+// about it without changing the metrics framework to use e.g. sharded counter atomics.
+//
+criterion_group!(
+    single_metric_multicore_scalability,
+    single_metric_multicore_scalability::bench,
+);
+mod single_metric_multicore_scalability {
+    use super::*;
+
+    pub fn bench(c: &mut Criterion) {
+        let mut g = c.benchmark_group("single_metric_multicore_scalability");
+
+        for nthreads in [1, 4, 8] {
+            g.bench_with_input(
+                BenchmarkId::new("nthreads", nthreads),
+                &nthreads,
+                |b, nthreads| {
+                    b.iter_custom(|iters| {
+                        let barrier = std::sync::Barrier::new(*nthreads + 1);
+
+                        let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
+
+                        std::thread::scope(|s| {
+                            for _ in 0..*nthreads {
+                                s.spawn(|| {
+                                    barrier.wait();
+                                    for _ in 0..iters {
+                                        metric.inc();
+                                    }
+                                    barrier.wait();
+                                });
+                            }
+                            barrier.wait();
+                            let start = std::time::Instant::now();
+                            barrier.wait();
+                            start.elapsed()
+                        })
+                    })
+                },
+            );
+        }
+        g.finish();
+    }
+}
+
+//
+// Demonstrates that even if we cache label value, the propagation of such a cached metric value
+// by Clone'ing it is a scalability bottleneck.
+// The reason is that it's an Arc internally and thus there's contention on the reference count atomics.
+//
+// We can avoid that by having long-lived references per thread (= indirection).
+//
+criterion_group!(
+    propagation_of_cached_label_value,
+    propagation_of_cached_label_value::bench_naive,
+    propagation_of_cached_label_value::bench_long_lived_reference_per_thread,
+);
+mod propagation_of_cached_label_value {
+    use std::sync::Arc;
+
+    use super::*;
+
+    pub fn bench_naive(c: &mut Criterion) {
+        let mut g = c.benchmark_group("propagation_of_cached_label_value__naive");
+
+        for nthreads in [1, 4, 8] {
+            g.bench_with_input(
+                BenchmarkId::new("nthreads", nthreads),
+                &nthreads,
+                |b, nthreads| {
+                    b.iter_custom(|iters| {
+                        let barrier = std::sync::Barrier::new(*nthreads + 1);
+
+                        let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
+
+                        std::thread::scope(|s| {
+                            for _ in 0..*nthreads {
+                                s.spawn(|| {
+                                    barrier.wait();
+                                    for _ in 0..iters {
+                                        // propagating the metric means we'd clone it into the child RequestContext
+                                        let propagated = metric.clone();
+                                        // simulate some work
+                                        criterion::black_box(propagated);
+                                    }
+                                    barrier.wait();
+                                });
+                            }
+                            barrier.wait();
+                            let start = std::time::Instant::now();
+                            barrier.wait();
+                            start.elapsed()
+                        })
+                    })
+                },
+            );
+        }
+        g.finish();
+    }
+
+    pub fn bench_long_lived_reference_per_thread(c: &mut Criterion) {
+        let mut g =
+            c.benchmark_group("propagation_of_cached_label_value__long_lived_reference_per_thread");
+
+        for nthreads in [1, 4, 8] {
+            g.bench_with_input(
+                BenchmarkId::new("nthreads", nthreads),
+                &nthreads,
+                |b, nthreads| {
+                    b.iter_custom(|iters| {
+                        let barrier = std::sync::Barrier::new(*nthreads + 1);
+
+                        let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
+
+                        std::thread::scope(|s| {
+                            for _ in 0..*nthreads {
+                                s.spawn(|| {
+                                    // This is the technique.
+                                    let this_threads_metric_reference = Arc::new(metric.clone());
+
+                                    barrier.wait();
+                                    for _ in 0..iters {
+                                        // propagating the metric means we'd clone it into the child RequestContext
+                                        let propagated = Arc::clone(&this_threads_metric_reference);
+                                        // simulate some work (include the pointer chase!)
+                                        criterion::black_box(&*propagated);
+                                    }
+                                    barrier.wait();
+                                });
+                            }
+                            barrier.wait();
+                            let start = std::time::Instant::now();
+                            barrier.wait();
+                            start.elapsed()
+                        })
+                    })
+                },
+            );
+        }
+    }
+}
+
+criterion_main!(
+    label_values,
+    single_metric_multicore_scalability,
+    propagation_of_cached_label_value
+);
+
+/*
+RUST_BACKTRACE=full cargo bench --bench bench_metrics --  --discard-baseline --noplot
+
+Results on an im4gn.2xlarge instance
+
+label_values__naive_usage/ntimelines/1 time:   [178.71 ns 178.74 ns 178.76 ns]
+label_values__naive_usage/ntimelines/4 time:   [532.94 ns 539.59 ns 546.31 ns]
+label_values__naive_usage/ntimelines/8 time:   [1.1082 µs 1.1109 µs 1.1135 µs]
+label_values__cache_label_values_lookup/ntimelines/1 time:   [6.4116 ns 6.4119 ns 6.4123 ns]
+label_values__cache_label_values_lookup/ntimelines/4 time:   [6.3482 ns 6.3819 ns 6.4079 ns]
+label_values__cache_label_values_lookup/ntimelines/8 time:   [6.4213 ns 6.5279 ns 6.6293 ns]
+single_metric_multicore_scalability/nthreads/1 time:   [6.0102 ns 6.0104 ns 6.0106 ns]
+single_metric_multicore_scalability/nthreads/4 time:   [38.127 ns 38.275 ns 38.416 ns]
+single_metric_multicore_scalability/nthreads/8 time:   [73.698 ns 74.882 ns 75.864 ns]
+propagation_of_cached_label_value__naive/nthreads/1 time:   [14.424 ns 14.425 ns 14.426 ns]
+propagation_of_cached_label_value__naive/nthreads/4 time:   [100.71 ns 102.53 ns 104.35 ns]
+propagation_of_cached_label_value__naive/nthreads/8 time:   [211.50 ns 214.44 ns 216.87 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time:   [14.135 ns 14.147 ns 14.160 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time:   [14.243 ns 14.255 ns 14.268 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time:   [14.470 ns 14.682 ns 14.895 ns]
+
+Results on an i3en.3xlarge instance
+
+label_values__naive_usage/ntimelines/1      time:   [117.32 ns 117.53 ns 117.74 ns]
+label_values__naive_usage/ntimelines/4      time:   [736.58 ns 741.12 ns 745.61 ns]
+label_values__naive_usage/ntimelines/8      time:   [1.4513 µs 1.4596 µs 1.4665 µs]
+label_values__cache_label_values_lookup/ntimelines/1      time:   [8.0964 ns 8.0979 ns 8.0995 ns]
+label_values__cache_label_values_lookup/ntimelines/4      time:   [8.1620 ns 8.2912 ns 8.4491 ns]
+label_values__cache_label_values_lookup/ntimelines/8      time:   [14.148 ns 14.237 ns 14.324 ns]
+single_metric_multicore_scalability/nthreads/1      time:   [8.0993 ns 8.1013 ns 8.1046 ns]
+single_metric_multicore_scalability/nthreads/4      time:   [80.039 ns 80.672 ns 81.297 ns]
+single_metric_multicore_scalability/nthreads/8      time:   [153.58 ns 154.23 ns 154.90 ns]
+propagation_of_cached_label_value__naive/nthreads/1     time:   [13.924 ns 13.926 ns 13.928 ns]
+propagation_of_cached_label_value__naive/nthreads/4     time:   [143.66 ns 145.27 ns 146.59 ns]
+propagation_of_cached_label_value__naive/nthreads/8     time:   [296.51 ns 297.90 ns 299.30 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1     time:   [14.013 ns 14.149 ns 14.308 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4     time:   [14.311 ns 14.625 ns 14.984 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8     time:   [25.981 ns 26.227 ns 26.476 ns]
+
+Results on a Standard L16s v3 (16 vcpus, 128 GiB memory)  Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
+
+label_values__naive_usage/ntimelines/1      time:   [101.63 ns 101.84 ns 102.06 ns]
+label_values__naive_usage/ntimelines/4      time:   [417.55 ns 424.73 ns 432.63 ns]
+label_values__naive_usage/ntimelines/8      time:   [874.91 ns 889.51 ns 904.25 ns]
+label_values__cache_label_values_lookup/ntimelines/1      time:   [5.7724 ns 5.7760 ns 5.7804 ns]
+label_values__cache_label_values_lookup/ntimelines/4      time:   [7.8878 ns 7.9401 ns 8.0034 ns]
+label_values__cache_label_values_lookup/ntimelines/8      time:   [7.2621 ns 7.6354 ns 8.0337 ns]
+single_metric_multicore_scalability/nthreads/1      time:   [5.7710 ns 5.7744 ns 5.7785 ns]
+single_metric_multicore_scalability/nthreads/4      time:   [66.629 ns 66.994 ns 67.336 ns]
+single_metric_multicore_scalability/nthreads/8      time:   [130.85 ns 131.98 ns 132.91 ns]
+propagation_of_cached_label_value__naive/nthreads/1     time:   [11.540 ns 11.546 ns 11.553 ns]
+propagation_of_cached_label_value__naive/nthreads/4     time:   [131.22 ns 131.90 ns 132.56 ns]
+propagation_of_cached_label_value__naive/nthreads/8     time:   [260.99 ns 262.75 ns 264.26 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1     time:   [11.544 ns 11.550 ns 11.557 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4     time:   [11.568 ns 11.642 ns 11.763 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8     time:   [13.416 ns 14.121 ns 14.886 ns]
+
+Results on an M4 MAX MacBook Pro   Total Number of Cores:	14 (10 performance and 4 efficiency)
+
+label_values__naive_usage/ntimelines/1      time:   [52.711 ns 53.026 ns 53.381 ns]
+label_values__naive_usage/ntimelines/4      time:   [323.99 ns 330.40 ns 337.53 ns]
+label_values__naive_usage/ntimelines/8      time:   [1.1615 µs 1.1998 µs 1.2399 µs]
+label_values__cache_label_values_lookup/ntimelines/1      time:   [1.6635 ns 1.6715 ns 1.6809 ns]
+label_values__cache_label_values_lookup/ntimelines/4      time:   [1.7786 ns 1.7876 ns 1.8028 ns]
+label_values__cache_label_values_lookup/ntimelines/8      time:   [1.8195 ns 1.8371 ns 1.8665 ns]
+single_metric_multicore_scalability/nthreads/1      time:   [1.7764 ns 1.7909 ns 1.8079 ns]
+single_metric_multicore_scalability/nthreads/4      time:   [33.875 ns 34.868 ns 35.923 ns]
+single_metric_multicore_scalability/nthreads/8      time:   [226.85 ns 235.30 ns 244.18 ns]
+propagation_of_cached_label_value__naive/nthreads/1     time:   [3.4337 ns 3.4491 ns 3.4660 ns]
+propagation_of_cached_label_value__naive/nthreads/4     time:   [69.486 ns 71.937 ns 74.472 ns]
+propagation_of_cached_label_value__naive/nthreads/8     time:   [434.87 ns 456.47 ns 477.84 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1     time:   [3.3767 ns 3.3974 ns 3.4220 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4     time:   [3.6105 ns 4.2355 ns 5.1463 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8     time:   [4.0889 ns 4.9714 ns 6.0779 ns]
+
+Results on a Hetzner AX102 AMD Ryzen 9 7950X3D 16-Core Processor
+
+label_values__naive_usage/ntimelines/1      time:   [64.510 ns 64.559 ns 64.610 ns]
+label_values__naive_usage/ntimelines/4      time:   [309.71 ns 326.09 ns 342.32 ns]
+label_values__naive_usage/ntimelines/8      time:   [776.92 ns 819.35 ns 856.93 ns]
+label_values__cache_label_values_lookup/ntimelines/1      time:   [1.2855 ns 1.2943 ns 1.3021 ns]
+label_values__cache_label_values_lookup/ntimelines/4      time:   [1.3865 ns 1.4139 ns 1.4441 ns]
+label_values__cache_label_values_lookup/ntimelines/8      time:   [1.5311 ns 1.5669 ns 1.6046 ns]
+single_metric_multicore_scalability/nthreads/1      time:   [1.1927 ns 1.1981 ns 1.2049 ns]
+single_metric_multicore_scalability/nthreads/4      time:   [24.346 ns 25.439 ns 26.634 ns]
+single_metric_multicore_scalability/nthreads/8      time:   [58.666 ns 60.137 ns 61.486 ns]
+propagation_of_cached_label_value__naive/nthreads/1     time:   [2.7067 ns 2.7238 ns 2.7402 ns]
+propagation_of_cached_label_value__naive/nthreads/4     time:   [62.723 ns 66.214 ns 69.787 ns]
+propagation_of_cached_label_value__naive/nthreads/8     time:   [164.24 ns 170.10 ns 175.68 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1     time:   [2.2915 ns 2.2960 ns 2.3012 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4     time:   [2.5726 ns 2.6158 ns 2.6624 ns]
+propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8     time:   [2.7068 ns 2.8243 ns 2.9824 ns]
+
+*/
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,17 +1,15 @@
-use std::{collections::HashMap, error::Error as _};
+use std::collections::HashMap;
+use std::error::Error as _;

 use bytes::Bytes;
-use reqwest::{IntoUrl, Method, StatusCode};
-
 use detach_ancestor::AncestorDetached;
 use http_utils::error::HttpErrorBody;
-use pageserver_api::{models::*, shard::TenantShardId};
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
-
+use pageserver_api::models::*;
+use pageserver_api::shard::TenantShardId;
 pub use reqwest::Body as ReqwestBody;
+use reqwest::{Certificate, IntoUrl, Method, StatusCode};
+use utils::id::{TenantId, TimelineId};
+use utils::lsn::Lsn;

 use crate::BlockUnblock;

@@ -40,6 +38,9 @@ pub enum Error {

    #[error("Cancelled")]
    Cancelled,
+
+    #[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
+    CreateClient(reqwest::Error),
 }

 pub type Result<T> = std::result::Result<T, Error>;
@@ -71,8 +72,17 @@ pub enum ForceAwaitLogicalSize {
 }

 impl Client {
-    pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
-        Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
+    pub fn new(
+        mgmt_api_endpoint: String,
+        jwt: Option<&str>,
+        ssl_ca_cert: Option<Certificate>,
+    ) -> Result<Self> {
+        let mut http_client = reqwest::Client::builder();
+        if let Some(ssl_ca_cert) = ssl_ca_cert {
+            http_client = http_client.add_root_certificate(ssl_ca_cert);
+        }
+        let http_client = http_client.build().map_err(Error::CreateClient)?;
+        Ok(Self::from_client(http_client, mgmt_api_endpoint, jwt))
    }

    pub fn from_client(
@@ -103,12 +113,10 @@ impl Client {
        debug_assert!(path.starts_with('/'));
        let uri = format!("{}{}", self.mgmt_api_endpoint, path);

-        let req = self.client.request(Method::GET, uri);
-        let req = if let Some(value) = &self.authorization_header {
-            req.header(reqwest::header::AUTHORIZATION, value)
-        } else {
-            req
-        };
+        let mut req = self.client.request(Method::GET, uri);
+        if let Some(value) = &self.authorization_header {
+            req = req.header(reqwest::header::AUTHORIZATION, value);
+        }
        req.send().await.map_err(Error::ReceiveBody)
    }

@@ -482,6 +490,7 @@ impl Client {
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        concurrency: Option<usize>,
+        recurse: bool,
    ) -> Result<()> {
        let mut path = reqwest::Url::parse(&format!(
            "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers",
@@ -489,6 +498,9 @@ impl Client {
        ))
        .expect("Cannot build URL");

+        path.query_pairs_mut()
+            .append_pair("recurse", &format!("{}", recurse));
+
        if let Some(concurrency) = concurrency {
            path.query_pairs_mut()
                .append_pair("concurrency", &format!("{}", concurrency));
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -1,23 +1,16 @@
 use std::sync::{Arc, Mutex};

-use futures::{
-    stream::{SplitSink, SplitStream},
-    SinkExt, StreamExt,
-};
-use pageserver_api::{
-    models::{
-        PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
-        PagestreamGetPageResponse,
-    },
-    reltag::RelTag,
+use futures::stream::{SplitSink, SplitStream};
+use futures::{SinkExt, StreamExt};
+use pageserver_api::models::{
+    PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
 };
+use pageserver_api::reltag::RelTag;
 use tokio::task::JoinHandle;
 use tokio_postgres::CopyOutStream;
 use tokio_util::sync::CancellationToken;
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use utils::id::{TenantId, TimelineId};
+use utils::lsn::Lsn;

 pub struct Client {
    client: tokio_postgres::Client,
--- a/pageserver/compaction/src/bin/compaction-simulator.rs
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -1,11 +1,11 @@
-use clap::{Parser, Subcommand};
-use pageserver_compaction::helpers::PAGE_SZ;
-use pageserver_compaction::simulator::MockTimeline;
-use rand::Rng;
 use std::io::Write;
 use std::path::{Path, PathBuf};
 use std::sync::OnceLock;

+use clap::{Parser, Subcommand};
+use pageserver_compaction::helpers::PAGE_SZ;
+use pageserver_compaction::simulator::MockTimeline;
+use rand::Rng;
 use utils::project_git_version;

 project_git_version!(GIT_VERSION);
@@ -157,8 +157,9 @@ async fn run_suite() -> anyhow::Result<()> {
 use std::fs::File;
 use std::io::Stdout;
 use std::sync::Mutex;
-use tracing_subscriber::fmt::writer::EitherWriter;
+
 use tracing_subscriber::fmt::MakeWriter;
+use tracing_subscriber::fmt::writer::EitherWriter;

 static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
 fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -17,20 +17,19 @@
 //! distance of image layers in LSN dimension is roughly equal to the logical
 //! database size. For example, if the logical database size is 10 GB, we would
 //! generate new image layers every 10 GB of WAL.
-use futures::StreamExt;
-use pageserver_api::shard::ShardIdentity;
-use tracing::{debug, info};
-
 use std::collections::{HashSet, VecDeque};
 use std::ops::Range;

-use crate::helpers::{
-    accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
-};
-use crate::interface::*;
+use futures::StreamExt;
+use pageserver_api::shard::ShardIdentity;
+use tracing::{debug, info};
 use utils::lsn::Lsn;

+use crate::helpers::{
+    PAGE_SZ, accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
+};
 use crate::identify_levels::identify_level;
+use crate::interface::*;

 /// Main entry point to compaction.
 ///
@@ -307,7 +306,7 @@ where
                let mut layer_ids: Vec<LayerId> = Vec::new();
                for layer_id in &job.input_layers {
                    let layer = &self.layers[layer_id.0].layer;
-                    if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
+                    if let Some(dl) = self.executor.downcast_delta_layer(layer, ctx).await? {
                        deltas.push(dl.clone());
                        layer_ids.push(*layer_id);
                    }
@@ -536,15 +535,16 @@ where
        let mut deltas: Vec<E::DeltaLayer> = Vec::new();
        for layer_id in &job.input_layers {
            let l = &self.layers[layer_id.0];
-            if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
+            if let Some(dl) = self.executor.downcast_delta_layer(&l.layer, ctx).await? {
                deltas.push(dl.clone());
            }
        }
        // Open stream
-        let key_value_stream =
-            std::pin::pin!(merge_delta_keys_buffered::<E>(deltas.as_slice(), ctx)
+        let key_value_stream = std::pin::pin!(
+            merge_delta_keys_buffered::<E>(deltas.as_slice(), ctx)
                .await?
-                .map(Result::<_, anyhow::Error>::Ok));
+                .map(Result::<_, anyhow::Error>::Ok)
+        );
        let mut new_jobs = Vec::new();

        // Slide a window through the keyspace
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -1,21 +1,21 @@
 //! This file contains generic utility functions over the interface types,
 //! which could be handy for any compaction implementation.
-use crate::interface::*;
+use std::collections::{BinaryHeap, VecDeque};
+use std::fmt::Display;
+use std::future::Future;
+use std::ops::{DerefMut, Range};
+use std::pin::Pin;
+use std::task::{Poll, ready};

 use futures::future::BoxFuture;
 use futures::{Stream, StreamExt};
 use itertools::Itertools;
 use pageserver_api::shard::ShardIdentity;
 use pin_project_lite::pin_project;
-use std::collections::BinaryHeap;
-use std::collections::VecDeque;
-use std::fmt::Display;
-use std::future::Future;
-use std::ops::{DerefMut, Range};
-use std::pin::Pin;
-use std::task::{ready, Poll};
 use utils::lsn::Lsn;

+use crate::interface::*;
+
 pub const PAGE_SZ: u64 = 8192;

 pub fn keyspace_total_size<K>(
--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -26,15 +26,15 @@
 //! file size, the file will still be considered to be part of L0 at the next
 //! iteration.

-use anyhow::bail;
 use std::collections::BTreeSet;
 use std::ops::Range;
+
+use anyhow::bail;
+use tracing::{info, trace};
 use utils::lsn::Lsn;

 use crate::interface::*;

-use tracing::{info, trace};
-
 pub struct Level<L> {
    pub lsn_range: Range<Lsn>,
    pub layers: Vec<L>,
@@ -60,7 +60,11 @@ where
        if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
            // shouldn't happen. Indicates that the caller passed a bogus
            // end_lsn.
-            bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
+            bail!(
+                "identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}",
+                end_lsn,
+                l.short_id()
+            );
        }
        // include image layers sitting exacty at `end_lsn`.
        let is_image = !l.is_delta();
@@ -246,9 +250,10 @@ impl<L> Level<L> {

 #[cfg(test)]
 mod tests {
+    use std::sync::{Arc, Mutex};
+
    use super::*;
    use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
-    use std::sync::{Arc, Mutex};

    fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
        MockLayer::Delta(Arc::new(MockDeltaLayer {
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -3,9 +3,12 @@
 //!
 //! All the heavy lifting is done by the create_image and create_delta
 //! functions that the implementor provides.
-use futures::Future;
-use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity};
 use std::ops::Range;
+
+use futures::Future;
+use pageserver_api::key::Key;
+use pageserver_api::keyspace::ShardedRange;
+use pageserver_api::shard::ShardIdentity;
 use utils::lsn::Lsn;

 /// Public interface. This is the main thing that the implementor needs to provide
@@ -55,6 +58,7 @@ pub trait CompactionJobExecutor {
    fn downcast_delta_layer(
        &self,
        layer: &Self::Layer,
+        ctx: &Self::RequestContext,
    ) -> impl Future<Output = anyhow::Result<Option<Self::DeltaLayer>>> + Send;

    // ----
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -1,22 +1,17 @@
 mod draw;

-use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
+use std::fmt::Write;
+use std::ops::Range;
+use std::sync::{Arc, Mutex};

+use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
 use futures::StreamExt;
 use pageserver_api::shard::ShardIdentity;
 use rand::Rng;
 use tracing::info;
-
 use utils::lsn::Lsn;

-use std::fmt::Write;
-use std::ops::Range;
-use std::sync::Arc;
-use std::sync::Mutex;
-
-use crate::helpers::PAGE_SZ;
-use crate::helpers::{merge_delta_keys, overlaps_with};
-
+use crate::helpers::{PAGE_SZ, merge_delta_keys, overlaps_with};
 use crate::interface;
 use crate::interface::CompactionLayer;

@@ -487,6 +482,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
    async fn downcast_delta_layer(
        &self,
        layer: &MockLayer,
+        _ctx: &MockRequestContext,
    ) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
        Ok(match layer {
            MockLayer::Delta(l) => Some(l.clone()),
--- a/pageserver/compaction/src/simulator/draw.rs
+++ b/pageserver/compaction/src/simulator/draw.rs
@@ -1,14 +1,14 @@
-use super::Key;
-use anyhow::Result;
 use std::cmp::Ordering;
-use std::{
-    collections::{BTreeMap, BTreeSet, HashSet},
-    fmt::Write,
-    ops::Range,
-};
-use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
+use std::collections::{BTreeMap, BTreeSet, HashSet};
+use std::fmt::Write;
+use std::ops::Range;
+
+use anyhow::Result;
+use svg_fmt::{BeginSvg, EndSvg, Fill, Stroke, Style, rgb};
 use utils::lsn::Lsn;

+use super::Key;
+
 // Map values to their compressed coordinate - the index the value
 // would have in a sorted and deduplicated list of all values.
 struct CoordinateMap<T: Ord + Copy> {
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -50,18 +50,18 @@
 //! ```
 //!

-use anyhow::{Context, Result};
-use pageserver_api::key::Key;
 use std::cmp::Ordering;
+use std::collections::{BTreeMap, BTreeSet};
 use std::io::{self, BufRead};
+use std::ops::Range;
 use std::path::PathBuf;
 use std::str::FromStr;
-use std::{
-    collections::{BTreeMap, BTreeSet},
-    ops::Range,
-};
-use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke};
-use utils::{lsn::Lsn, project_git_version};
+
+use anyhow::{Context, Result};
+use pageserver_api::key::Key;
+use svg_fmt::{BeginSvg, EndSvg, Fill, Stroke, rectangle, rgb};
+use utils::lsn::Lsn;
+use utils::project_git_version;

 project_git_version!(GIT_VERSION);

--- a/pageserver/ctl/src/key.rs
+++ b/pageserver/ctl/src/key.rs
@@ -1,11 +1,10 @@
+use std::str::FromStr;
+
 use anyhow::Context;
 use clap::Parser;
-use pageserver_api::{
-    key::Key,
-    reltag::{BlockNumber, RelTag, SlruKind},
-    shard::{ShardCount, ShardStripeSize},
-};
-use std::str::FromStr;
+use pageserver_api::key::Key;
+use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
+use pageserver_api::shard::{ShardCount, ShardStripeSize};

 #[derive(Parser)]
 pub(super) struct DescribeKeyCommand {
@@ -394,7 +393,10 @@ mod tests {
    fn single_positional_spanalike_is_key_material() {
        // why is this needed? if you are checking many, then copypaste starts to appeal
        let strings = [
-            (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"),
+            (
+                line!(),
+                "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0",
+            ),
            (line!(), "rel=1663/208101/2620_fsm blkno=2"),
            (line!(), "rel=1663/208101/2620.1 blkno=2"),
        ];
@@ -420,7 +422,15 @@ mod tests {
    #[test]
    fn multiple_spanlike_args() {
        let strings = [
-            (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]),
+            (
+                line!(),
+                &[
+                    "process_query{tenant_id=C",
+                    "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm",
+                    "blkno=2",
+                    "req_lsn=0/238D98C8}",
+                ][..],
+            ),
            (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]),
            (line!(), &["1663/208101/2620_fsm", "2"][..]),
        ];
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -2,27 +2,27 @@
 //!
 //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.

-use anyhow::{anyhow, Result};
-use camino::{Utf8Path, Utf8PathBuf};
-use pageserver::context::{DownloadBehavior, RequestContext};
-use pageserver::task_mgr::TaskKind;
-use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
-use pageserver::virtual_file::api::IoMode;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
 use std::str::FromStr;
 use std::{fs, str};

+use anyhow::{Result, anyhow};
+use camino::{Utf8Path, Utf8PathBuf};
+use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::page_cache::{self, PAGE_SZ};
+use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
-use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
-use pageserver::tenant::storage_layer::{range_overlaps, LayerName};
+use pageserver::tenant::storage_layer::delta_layer::{DELTA_KEY_SIZE, Summary};
+use pageserver::tenant::storage_layer::{LayerName, range_overlaps};
+use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
+use pageserver::virtual_file::api::IoMode;
 use pageserver::virtual_file::{self, VirtualFile};
-use pageserver_api::key::{Key, KEY_SIZE};
-
-use utils::{bin_ser::BeSer, lsn::Lsn};
+use pageserver_api::key::{KEY_SIZE, Key};
+use utils::bin_ser::BeSer;
+use utils::lsn::Lsn;

 use crate::AnalyzeLayerMapCmd;

--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -1,3 +1,4 @@
+use std::fs::{self, File};
 use std::path::{Path, PathBuf};

 use anyhow::Result;
@@ -5,12 +6,10 @@ use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
-use pageserver::tenant::storage_layer::{delta_layer, image_layer};
-use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
+use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, image_layer};
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use pageserver::virtual_file::api::IoMode;
 use pageserver::{page_cache, virtual_file};
-use std::fs::{self, File};
 use utils::id::{TenantId, TimelineId};

 use crate::layer_map_analyzer::parse_filename;
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -11,33 +11,29 @@ mod layer_map_analyzer;
 mod layers;
 mod page_trace;

-use page_trace::PageTraceCmd;
-use std::{
-    str::FromStr,
-    time::{Duration, SystemTime},
-};
+use std::str::FromStr;
+use std::time::{Duration, SystemTime};

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
 use index_part::IndexPartCmd;
 use layers::LayerCmd;
-use pageserver::{
-    context::{DownloadBehavior, RequestContext},
-    page_cache,
-    task_mgr::TaskKind,
-    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
-    virtual_file::{self, api::IoMode},
-};
+use page_trace::PageTraceCmd;
+use pageserver::context::{DownloadBehavior, RequestContext};
+use pageserver::page_cache;
+use pageserver::task_mgr::TaskKind;
+use pageserver::tenant::dump_layerfile_from_path;
+use pageserver::tenant::metadata::TimelineMetadata;
+use pageserver::virtual_file::api::IoMode;
+use pageserver::virtual_file::{self};
 use pageserver_api::shard::TenantShardId;
 use postgres_ffi::ControlFileData;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use tokio_util::sync::CancellationToken;
-use utils::{
-    id::TimelineId,
-    logging::{self, LogFormat, TracingErrorLayerEnablement},
-    lsn::Lsn,
-    project_git_version,
-};
+use utils::id::TimelineId;
+use utils::logging::{self, LogFormat, TracingErrorLayerEnablement};
+use utils::lsn::Lsn;
+use utils::project_git_version;

 project_git_version!(GIT_VERSION);

@@ -355,7 +351,9 @@ mod tests {
        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
-        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
+        assert_valid(
+            "pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683",
+        );
        assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
    }
 }
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -15,6 +15,7 @@ hdrhistogram.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
 rand.workspace = true
+reqwest.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tracing.workspace = true
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -1,12 +1,12 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Instant;
+
 use pageserver_api::models::{TenantConfig, TenantConfigRequest};
 use pageserver_api::shard::TenantShardId;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;

-use std::collections::HashMap;
-use std::sync::Arc;
-use std::time::Instant;
-
 /// Ingest aux files into the pageserver.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
@@ -36,7 +36,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
-    ));
+        None, // TODO: support ssl_ca_file for https APIs in pagebench.
+    )?);

    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -1,16 +1,3 @@
-use anyhow::Context;
-use pageserver_api::shard::TenantShardId;
-use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
-use pageserver_client::page_service::BasebackupRequest;
-
-use utils::id::TenantTimelineId;
-use utils::lsn::Lsn;
-
-use rand::prelude::*;
-use tokio::sync::Barrier;
-use tokio::task::JoinSet;
-use tracing::{info, instrument};
-
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
 use std::ops::Range;
@@ -18,6 +5,17 @@ use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};
 use std::time::Instant;

+use anyhow::Context;
+use pageserver_api::shard::TenantShardId;
+use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
+use pageserver_client::page_service::BasebackupRequest;
+use rand::prelude::*;
+use tokio::sync::Barrier;
+use tokio::task::JoinSet;
+use tracing::{info, instrument};
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
+
 use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
 use crate::util::{request_stats, tokio_thread_local_stats};

@@ -79,7 +77,8 @@ async fn main_impl(
    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
-    ));
+        None, // TODO: support ssl_ca_file for https APIs in pagebench.
+    )?);

    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,18 +1,3 @@
-use anyhow::Context;
-use camino::Utf8PathBuf;
-use pageserver_api::key::Key;
-use pageserver_api::keyspace::KeySpaceAccum;
-use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
-
-use pageserver_api::shard::TenantShardId;
-use tokio_util::sync::CancellationToken;
-use utils::id::TenantTimelineId;
-use utils::lsn::Lsn;
-
-use rand::prelude::*;
-use tokio::task::JoinSet;
-use tracing::info;
-
 use std::collections::{HashSet, VecDeque};
 use std::future::Future;
 use std::num::NonZeroUsize;
@@ -21,6 +6,19 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};

+use anyhow::Context;
+use camino::Utf8PathBuf;
+use pageserver_api::key::Key;
+use pageserver_api::keyspace::KeySpaceAccum;
+use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
+use pageserver_api::shard::TenantShardId;
+use rand::prelude::*;
+use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
+use tracing::info;
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
+
 use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
 use crate::util::{request_stats, tokio_thread_local_stats};

@@ -127,7 +125,8 @@ async fn main_impl(
    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
-    ));
+        None, // TODO: support ssl_ca_file for https APIs in pagebench.
+    )?);

    if let Some(engine_str) = &args.set_io_engine {
        mgmt_api_client.put_io_engine(engine_str).await?;
--- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
+++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
@@ -1,23 +1,19 @@
-use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
+use std::f64;
+use std::num::NonZeroUsize;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::{Duration, Instant};

+use pageserver_api::models::HistoricLayerInfo;
+use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use rand::seq::SliceRandom;
+use tokio::sync::{OwnedSemaphorePermit, mpsc};
+use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info};
 use utils::id::{TenantTimelineId, TimelineId};

-use std::{f64, sync::Arc};
-use tokio::{
-    sync::{mpsc, OwnedSemaphorePermit},
-    task::JoinSet,
-};
-
-use std::{
-    num::NonZeroUsize,
-    sync::atomic::{AtomicU64, Ordering},
-    time::{Duration, Instant},
-};
-
 /// Evict & on-demand download random layers.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
@@ -87,7 +83,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
-    ));
+        None, // TODO: support ssl_ca_file for https APIs in pagebench.
+    )?);

    if let Some(engine_str) = &args.set_io_engine {
        mgmt_api_client.put_io_engine(engine_str).await?;
--- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
+++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
@@ -2,11 +2,10 @@ use std::sync::Arc;

 use humantime::Duration;
 use pageserver_api::shard::TenantShardId;
+use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
 use tokio::task::JoinSet;
 use utils::id::TenantTimelineId;

-use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
-
 #[derive(clap::Parser)]
 pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
@@ -41,7 +40,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
-    ));
+        None, // TODO: support ssl_ca_file for https APIs in pagebench.
+    )?);

    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -33,8 +33,9 @@ use utils::lsn::Lsn;

 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::Version;
-use crate::tenant::Timeline;
 use crate::tenant::storage_layer::IoConcurrency;
+use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::{PageReconstructError, Timeline};

 #[derive(Debug, thiserror::Error)]
 pub enum BasebackupError {
@@ -42,6 +43,26 @@ pub enum BasebackupError {
    Server(#[from] anyhow::Error),
    #[error("basebackup client error {0:#} when {1}")]
    Client(#[source] io::Error, &'static str),
+    #[error("basebackup during shutdown")]
+    Shutdown,
+}
+
+impl From<PageReconstructError> for BasebackupError {
+    fn from(value: PageReconstructError) -> Self {
+        if matches!(value, PageReconstructError::Cancelled) {
+            return Self::Shutdown; // cancellation is reported as shutdown
+        }
+        Self::Server(value.into())
+    }
+}
+
+impl From<GetVectoredError> for BasebackupError {
+    fn from(value: GetVectoredError) -> Self {
+        if matches!(value, GetVectoredError::Cancelled) {
+            return Self::Shutdown; // cancellation is reported as shutdown
+        }
+        Self::Server(value.into())
+    }
 }

 /// Create basebackup with non-rel data in it.
@@ -127,7 +148,7 @@ where
            timeline
                .gate
                .enter()
-                .map_err(|e| BasebackupError::Server(e.into()))?,
+                .map_err(|_| BasebackupError::Shutdown)?,
        ),
    };
    basebackup
@@ -323,8 +344,7 @@ where
            let slru_partitions = self
                .timeline
                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?
+                .await?
                .partition(
                    self.timeline.get_shard_identity(),
                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
@@ -336,11 +356,10 @@ where
                let blocks = self
                    .timeline
                    .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                    .await?;

                for (key, block) in blocks {
-                    let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
+                    let block = block?;
                    slru_builder.add_block(&key, block).await?;
                }
            }
@@ -349,11 +368,8 @@ where

        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in self
-            .timeline
-            .list_dbdirs(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+        for ((spcnode, dbnode), has_relmap_file) in
+            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
        {
            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;

@@ -362,8 +378,7 @@ where
            let rels = self
                .timeline
                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
                // contents of UNLOGGED relations. Postgres copies it in
@@ -391,8 +406,7 @@ where
        let aux_files = self
            .timeline
            .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;
        let aux_scan_time = start_time.elapsed();
        let aux_estimated_size = aux_files
            .values()
@@ -451,16 +465,14 @@ where
        for xid in self
            .timeline
            .list_twophase_files(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+            .await?
        {
            self.add_twophase_file(xid).await?;
        }
        let repl_origins = self
            .timeline
            .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;
        let n_origins = repl_origins.len();
        if n_origins != 0 {
            //
@@ -505,8 +517,7 @@ where
        let nblocks = self
            .timeline
            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;

        // If the relation is empty, create an empty file
        if nblocks == 0 {
@@ -532,8 +543,7 @@ where
                    // TODO: investigate using get_vectored for the entire startblk..endblk range.
                    // But this code path is not on the critical path for most basebackups (?).
                    .get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                    .await?;
                segment_data.extend_from_slice(&img[..]);
            }

@@ -567,8 +577,7 @@ where
            let img = self
                .timeline
                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;

            if img.len()
                != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
@@ -622,8 +631,7 @@ where
                && self
                    .timeline
                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?
+                    .await?
                    .is_empty()
            {
                return Ok(());
@@ -674,8 +682,7 @@ where
        let img = self
            .timeline
            .get_twophase_file(xid, self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;

        let mut buf = BytesMut::new();
        buf.extend_from_slice(&img[..]);
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -14,6 +14,7 @@ use camino::Utf8Path;
 use clap::{Arg, ArgAction, Command};
 use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
 use metrics::set_build_info_metric;
+use nix::sys::socket::{setsockopt, sockopt};
 use pageserver::config::{PageServerConf, PageserverIdentity};
 use pageserver::controller_upcall_client::ControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
@@ -24,11 +25,12 @@ use pageserver::task_mgr::{
 };
 use pageserver::tenant::{TenantSharedResources, mgr, secondary};
 use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service,
-    task_mgr, virtual_file,
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
+    page_cache, page_service, task_mgr, virtual_file,
 };
 use postgres_backend::AuthType;
 use remote_storage::GenericRemoteStorage;
+use rustls_pki_types::{CertificateDer, PrivateKeyDer};
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
@@ -342,11 +344,25 @@ fn start_pageserver(
    info!("Starting pageserver http handler on {http_addr}");
    let http_listener = tcp_listener::bind(http_addr)?;

-    let pg_addr = &conf.listen_pg_addr;
+    let https_listener = match conf.listen_https_addr.as_ref() {
+        Some(https_addr) => {
+            info!("Starting pageserver https handler on {https_addr}");
+            Some(tcp_listener::bind(https_addr)?)
+        }
+        None => None,
+    };

+    let pg_addr = &conf.listen_pg_addr;
    info!("Starting pageserver pg protocol handler on {pg_addr}");
    let pageserver_listener = tcp_listener::bind(pg_addr)?;

+    // Enable SO_KEEPALIVE on the socket, to detect dead connections faster.
+    // These are configured via net.ipv4.tcp_keepalive_* sysctls.
+    //
+    // TODO: also set this on the walreceiver socket, but tokio-postgres doesn't
+    // support enabling keepalives while using the default OS sysctls.
+    setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?;
+
    // Launch broker client
    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
    let broker_client = WALRECEIVER_RUNTIME
@@ -567,9 +583,8 @@ fn start_pageserver(

    // Start up the service to handle HTTP mgmt API request. We created the
    // listener earlier already.
-    let http_endpoint_listener = {
+    let (http_endpoint_listener, https_endpoint_listener) = {
        let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
-        let cancel = CancellationToken::new();

        let router_state = Arc::new(
            http::routes::State::new(
@@ -584,22 +599,51 @@ fn start_pageserver(
            )
            .context("Failed to initialize router state")?,
        );
+
        let router = http::make_router(router_state, launch_ts, http_auth.clone())?
            .build()
            .map_err(|err| anyhow!(err))?;
-        let service = http_utils::RouterService::new(router).unwrap();
-        let server = hyper0::Server::from_tcp(http_listener)?
-            .serve(service)
-            .with_graceful_shutdown({
-                let cancel = cancel.clone();
-                async move { cancel.clone().cancelled().await }
-            });

-        let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-            "http endpoint listener",
-            server,
-        ));
-        HttpEndpointListener(CancellableTask { task, cancel })
+        let service =
+            Arc::new(http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow!(err))?);
+
+        let http_task = {
+            let server =
+                http_utils::server::Server::new(Arc::clone(&service), http_listener, None)?;
+            let cancel = CancellationToken::new();
+
+            let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+                "http endpoint listener",
+                server.serve(cancel.clone()),
+            ));
+            HttpEndpointListener(CancellableTask { task, cancel })
+        };
+
+        let https_task = match https_listener {
+            Some(https_listener) => {
+                let certs = load_certs(&conf.ssl_cert_file)?;
+                let key = load_private_key(&conf.ssl_key_file)?;
+
+                let server_config = rustls::ServerConfig::builder()
+                    .with_no_client_auth()
+                    .with_single_cert(certs, key)?;
+
+                let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
+
+                let server =
+                    http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
+                let cancel = CancellationToken::new();
+
+                let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+                    "https endpoint listener",
+                    server.serve(cancel.clone()),
+                ));
+                Some(HttpsEndpointListener(CancellableTask { task, cancel }))
+            }
+            None => None,
+        };
+
+        (http_task, https_task)
    };

    let consumption_metrics_tasks = {
@@ -675,6 +719,7 @@ fn start_pageserver(
        shutdown_pageserver.cancel();
        pageserver::shutdown_pageserver(
            http_endpoint_listener,
+            https_endpoint_listener,
            page_service,
            consumption_metrics_tasks,
            disk_usage_eviction_task,
@@ -689,6 +734,25 @@ fn start_pageserver(
    })
 }

+/// Loads every PEM-encoded certificate found in the file at `filename`.
+///
+/// # Errors
+///
+/// Returns an I/O error if the file cannot be opened or a PEM section fails to parse, and an
+/// [`std::io::ErrorKind::InvalidData`] error if the file contains no certificate at all.
+fn load_certs(filename: &Utf8Path) -> std::io::Result<Vec<CertificateDer<'static>>> {
+    let file = std::fs::File::open(filename)?;
+    let mut reader = std::io::BufReader::new(file);
+
+    let certs = rustls_pemfile::certs(&mut reader).collect::<std::io::Result<Vec<_>>>()?;
+
+    // An empty chain is useless for serving TLS. Fail here, naming the offending file,
+    // the same way `load_private_key` reports a file without a key.
+    if certs.is_empty() {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::InvalidData,
+            format!("no certificates found in {}", filename.as_str()),
+        ));
+    }
+
+    Ok(certs)
+}
+
+/// Loads a private key from the PEM file at `filename`.
+///
+/// # Errors
+///
+/// Fails if the file cannot be opened, if PEM parsing fails, or if no private key is found.
+fn load_private_key(filename: &Utf8Path) -> anyhow::Result<PrivateKeyDer<'static>> {
+    let file = std::fs::File::open(filename)?;
+    let mut reader = std::io::BufReader::new(file);
+
+    let key = rustls_pemfile::private_key(&mut reader)?;
+
+    // `ok_or_else` defers building the error (formatting + allocation) to the failure path;
+    // `ok_or` would construct it even when a key was found.
+    key.ok_or_else(|| anyhow::anyhow!("no private key found in {}", filename.as_str()))
+}
+
 async fn create_remote_storage_client(
    conf: &'static PageServerConf,
 ) -> anyhow::Result<GenericRemoteStorage> {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -53,6 +53,11 @@ pub struct PageServerConf {
    pub listen_pg_addr: String,
    /// Example (default): 127.0.0.1:9898
    pub listen_http_addr: String,
+    /// Example: 127.0.0.1:9899
+    pub listen_https_addr: Option<String>,
+
+    pub ssl_key_file: Utf8PathBuf,
+    pub ssl_cert_file: Utf8PathBuf,

    /// Current availability zone. Used for traffic metrics.
    pub availability_zone: Option<String>,
@@ -194,6 +199,13 @@ pub struct PageServerConf {
    /// Interpreted protocol feature: if enabled, validate that the logical WAL received from
    /// safekeepers does not have gaps.
    pub validate_wal_contiguity: bool,
+
+    /// When set, the previously written to disk heatmap is loaded on tenant attach and used
+    /// to avoid clobbering the heatmap from new, cold, attached locations.
+    pub load_previous_heatmap: bool,
+
+    /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
+    pub generate_unarchival_heatmap: bool,
 }

 /// Token for authentication to safekeepers
@@ -310,6 +322,9 @@ impl PageServerConf {
        let pageserver_api::config::ConfigToml {
            listen_pg_addr,
            listen_http_addr,
+            listen_https_addr,
+            ssl_key_file,
+            ssl_cert_file,
            availability_zone,
            wait_lsn_timeout,
            wal_redo_timeout,
@@ -358,6 +373,8 @@ impl PageServerConf {
            get_vectored_concurrent_io,
            enable_read_path_debugging,
            validate_wal_contiguity,
+            load_previous_heatmap,
+            generate_unarchival_heatmap,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -366,6 +383,9 @@ impl PageServerConf {
            // ------------------------------------------------------------
            listen_pg_addr,
            listen_http_addr,
+            listen_https_addr,
+            ssl_key_file,
+            ssl_cert_file,
            availability_zone,
            wait_lsn_timeout,
            wal_redo_timeout,
@@ -447,6 +467,8 @@ impl PageServerConf {
            no_sync: no_sync.unwrap_or(false),
            enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
            validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
+            load_previous_heatmap: load_previous_heatmap.unwrap_or(true),
+            generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true),
        };

        // ------------------------------------------------------------
@@ -480,7 +502,9 @@ impl PageServerConf {
    #[cfg(test)]
    pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
        let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
-        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
+
+        // A fresh random UUID is appended, so every call yields a different directory name.
+        // NOTE(review): presumably this keeps runs of the same test from sharing a dir — confirm.
+        let test_id = uuid::Uuid::new_v4();
+        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}"))
    }

    pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -493,6 +517,8 @@ impl PageServerConf {
            metric_collection_interval: Duration::from_secs(60),
            synthetic_size_calculation_interval: Duration::from_secs(60),
            background_task_maximum_delay: Duration::ZERO,
+            load_previous_heatmap: Some(true),
+            generate_unarchival_heatmap: Some(true),
            ..Default::default()
        };
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -89,16 +89,112 @@
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.

-use crate::task_mgr::TaskKind;
+use std::sync::Arc;
+
+use once_cell::sync::Lazy;
+use tracing::warn;
+use utils::{id::TimelineId, shard::TenantShardId};
+
+use crate::{
+    metrics::{StorageIoSizeMetrics, TimelineMetrics},
+    task_mgr::TaskKind,
+    tenant::Timeline,
+};

 // The main structure of this module, see module-level comment.
-#[derive(Debug)]
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
    access_stats_behavior: AccessStatsBehavior,
    page_content_kind: PageContentKind,
    read_path_debug: bool,
+    /// Selects which metrics `io_size_metrics()` returns for this context; see [`Scope`].
+    scope: Scope,
+}
+
+/// What a [`RequestContext`] is scoped to. `RequestContext::io_size_metrics` uses it to pick
+/// the [`StorageIoSizeMetrics`] instance to hand out.
+#[derive(Clone)]
+pub(crate) enum Scope {
+    /// Scope produced by `Scope::new_global`. Asking a context in this scope for its IO size
+    /// metrics panics in unit-test / `testing` builds and logs a warning otherwise.
+    Global {
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
+    },
+    /// Scope produced by `Scope::new_secondary_tenant`; it borrows a `'static` metrics
+    /// instance rather than owning one.
+    SecondaryTenant {
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
+    },
+    /// Scope produced by `Scope::new_secondary_timeline`; it owns a metrics instance built
+    /// from the tenant, shard and timeline ids.
+    SecondaryTimeline {
+        io_size_metrics: crate::metrics::StorageIoSizeMetrics,
+    },
+    /// Scope backed by a timeline's [`TimelineMetrics`]; IO size metrics are read from its
+    /// `storage_io_size` field.
+    Timeline {
+        // We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
+        // context creation contending for the ref counters of the Arc<TimelineMetrics>,
+        // which are shared among all tasks that operate on the timeline, especially
+        // concurrent page_service connections.
+        #[allow(clippy::redundant_allocation)]
+        arc_arc: Arc<Arc<TimelineMetrics>>,
+    },
+    /// Scope produced by `Scope::new_unit_test`; only compiled for unit tests.
+    #[cfg(test)]
+    UnitTest {
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
+    },
+}
+
+/// Lazily created metrics instance constructed with `"*"` for all three label arguments.
+/// It is the instance behind the `Global`, `SecondaryTenant` and `UnitTest` scopes.
+static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
+    Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));
+
+impl Scope {
+    /// Scope that points at the shared `GLOBAL_IO_SIZE_METRICS`.
+    pub(crate) fn new_global() -> Self {
+        Self::Global {
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
+        }
+    }
+
+    /// Scope backed by the metrics of `timeline`.
+    ///
+    /// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
+    /// of a compaction iteration.
+    pub(crate) fn new_timeline(timeline: &Timeline) -> Self {
+        let shared: Arc<TimelineMetrics> = Arc::clone(&timeline.metrics);
+        Self::Timeline {
+            arc_arc: Arc::new(shared),
+        }
+    }
+
+    /// Scope for a pagestream request, reusing the metrics handle stored in `timeline_handle`
+    /// (a reference-count bump instead of a new allocation).
+    pub(crate) fn new_page_service_pagestream(
+        timeline_handle: &crate::tenant::timeline::handle::Handle<
+            crate::page_service::TenantManagerTypes,
+        >,
+    ) -> Self {
+        let arc_arc: Arc<Arc<TimelineMetrics>> = Arc::clone(&timeline_handle.metrics);
+        Self::Timeline { arc_arc }
+    }
+
+    /// Scope for a secondary timeline, with metrics labelled by tenant id, shard slug and
+    /// timeline id.
+    pub(crate) fn new_secondary_timeline(
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Self {
+        // TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.
+
+        Self::SecondaryTimeline {
+            io_size_metrics: StorageIoSizeMetrics::new(
+                &tenant_shard_id.tenant_id.to_string(),
+                &tenant_shard_id.shard_slug().to_string(),
+                &timeline_id.to_string(),
+            ),
+        }
+    }
+
+    /// Scope for secondary-tenant-level work; uses the shared `GLOBAL_IO_SIZE_METRICS`.
+    pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
+        // Historical background: before metrics were propagated via RequestContext, labels
+        // were inferred from the file path. The only tenant-scoped VirtualFile user is the
+        // heatmap download & read, and the labels inferred for the heatmap's local path were
+        // those of the global metric (*,*,*). We keep that, and apply it to everything that
+        // is secondary-tenant scoped.
+        //
+        // Moving to (tenant_id, shard_id, '*') labels for secondary tenants later would
+        // require handling the metric lifecycle, i.e., removing them on secondary tenant
+        // shutdown like we do for attached timelines. (There is no attached-tenant-scoped
+        // VirtualFile usage at this point, which is how tenant-scoped handling was
+        // side-stepped there.)
+        Self::SecondaryTenant {
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
+        }
+    }
+
+    /// Scope for unit tests; uses the shared `GLOBAL_IO_SIZE_METRICS`.
+    #[cfg(test)]
+    pub(crate) fn new_unit_test() -> Self {
+        Self::UnitTest {
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
+        }
+    }
+}

 /// The kind of access to the page cache.
@@ -157,6 +253,7 @@ impl RequestContextBuilder {
                access_stats_behavior: AccessStatsBehavior::Update,
                page_content_kind: PageContentKind::Unknown,
                read_path_debug: false,
+                scope: Scope::new_global(),
            },
        }
    }
@@ -171,10 +268,16 @@ impl RequestContextBuilder {
                access_stats_behavior: original.access_stats_behavior,
                page_content_kind: original.page_content_kind,
                read_path_debug: original.read_path_debug,
+                scope: original.scope.clone(),
            },
        }
    }

+    /// Set the [`TaskKind`] of the context being built, replacing the current one.
+    pub fn task_kind(mut self, k: TaskKind) -> Self {
+        self.inner.task_kind = k;
+        self
+    }
+
    /// Configure the DownloadBehavior of the context: whether to
    /// download missing layers, and/or warn on the download.
    pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
@@ -199,6 +302,11 @@ impl RequestContextBuilder {
        self
    }

+    /// Set the [`Scope`] of the context being built, replacing the current one.
+    pub(crate) fn scope(mut self, s: Scope) -> Self {
+        self.inner.scope = s;
+        self
+    }
+
    pub fn build(self) -> RequestContext {
        self.inner
    }
@@ -281,7 +389,50 @@ impl RequestContext {
    }

    fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        Self::new(task_kind, download_behavior)
+        // Start from a copy of `self` (so the scope is carried over), then override only the
+        // task kind and the download behavior.
+        RequestContextBuilder::extend(self)
+            .task_kind(task_kind)
+            .download_behavior(download_behavior)
+            .build()
+    }
+
+    /// Returns a copy of this context whose scope is replaced by one built from `timeline`.
+    ///
+    /// The scope is created via `Scope::new_timeline`, which allocates.
+    pub fn with_scope_timeline(&self, timeline: &Arc<Timeline>) -> Self {
+        let scope = Scope::new_timeline(timeline);
+        RequestContextBuilder::extend(self).scope(scope).build()
+    }
+
+    /// Returns a copy of this context whose scope is replaced by the timeline scope derived
+    /// from `timeline_handle` (see `Scope::new_page_service_pagestream`).
+    pub(crate) fn with_scope_page_service_pagestream(
+        &self,
+        timeline_handle: &crate::tenant::timeline::handle::Handle<
+            crate::page_service::TenantManagerTypes,
+        >,
+    ) -> Self {
+        let scope = Scope::new_page_service_pagestream(timeline_handle);
+        RequestContextBuilder::extend(self).scope(scope).build()
+    }
+
+    /// Returns a copy of this context scoped to the secondary timeline identified by
+    /// `tenant_shard_id` and `timeline_id` (see `Scope::new_secondary_timeline`).
+    pub fn with_scope_secondary_timeline(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Self {
+        let scope = Scope::new_secondary_timeline(tenant_shard_id, timeline_id);
+        RequestContextBuilder::extend(self).scope(scope).build()
+    }
+
+    /// Returns a copy of this context scoped to a secondary tenant
+    /// (see `Scope::new_secondary_tenant`).
+    pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self {
+        let scope = Scope::new_secondary_tenant(tenant_shard_id);
+        RequestContextBuilder::extend(self).scope(scope).build()
+    }
+
+    /// Builds a brand-new `TaskKind::UnitTest` context with the unit-test scope.
+    ///
+    /// NOTE(review): unlike the other `with_scope_*` methods, nothing is inherited from
+    /// `self` here (the builder starts from `new`, not `extend`) — confirm this is intended.
+    #[cfg(test)]
+    pub fn with_scope_unit_test(&self) -> Self {
+        let scope = Scope::new_unit_test();
+        RequestContextBuilder::new(TaskKind::UnitTest)
+            .scope(scope)
+            .build()
    }

    pub fn task_kind(&self) -> TaskKind {
@@ -303,4 +454,38 @@ impl RequestContext {
    pub(crate) fn read_path_debug(&self) -> bool {
        self.read_path_debug
    }
+
+    /// Returns the [`StorageIoSizeMetrics`] selected by this context's scope.
+    ///
+    /// # Panics
+    ///
+    /// With a `Scope::Global` context, panics when compiled for unit tests or with the
+    /// `testing` feature. In other builds that case logs a warning (with a backtrace) and
+    /// returns the metrics stored in the `Global` variant.
+    pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
+        match &self.scope {
+            Scope::Global { io_size_metrics } => {
+                // Both flags are compile-time constants: `cfg!` evaluates to a bool literal.
+                let is_unit_test = cfg!(test);
+                let is_regress_test_build = cfg!(feature = "testing");
+                if is_unit_test || is_regress_test_build {
+                    panic!("all VirtualFile instances are timeline-scoped");
+                } else {
+                    use once_cell::sync::Lazy;
+                    use std::sync::Mutex;
+                    use std::time::Duration;
+                    use utils::rate_limit::RateLimit;
+                    // One limiter shared by all callers, constructed with a 1 second interval.
+                    static LIMIT: Lazy<Mutex<RateLimit>> =
+                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
+                    let mut guard = LIMIT.lock().unwrap();
+                    // The backtrace is captured inside the closure, i.e. only when the
+                    // limiter actually invokes it.
+                    guard.call2(|rate_limit_stats| {
+                        warn!(
+                            %rate_limit_stats,
+                            backtrace=%std::backtrace::Backtrace::force_capture(),
+                            "all VirtualFile instances are timeline-scoped",
+                        );
+                    });
+
+                    io_size_metrics
+                }
+            }
+            Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
+            Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
+            Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
+            #[cfg(test)]
+            Scope::UnitTest { io_size_metrics } => io_size_metrics,
+        }
+    }
 }
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -84,6 +84,7 @@ impl ControllerUpcallClient {
        })
    }

+    #[tracing::instrument(skip_all)]
    async fn retry_http_forever<R, T>(
        &self,
        url: &url::Url,
@@ -108,7 +109,7 @@ impl ControllerUpcallClient {
            |_| false,
            3,
            u32::MAX,
-            "calling control plane generation validation API",
+            "storage controller upcall",
            &self.cancel,
        )
        .await
@@ -125,11 +126,12 @@ impl ControllerUpcallClient {

 impl ControlPlaneGenerationsApi for ControllerUpcallClient {
    /// Block until we get a successful response, or error out if we are shut down
+    #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
    async fn re_attach(
        &self,
        conf: &PageServerConf,
    ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
-        let re_attach_path = self
+        let url = self
            .base_url
            .join("re-attach")
            .expect("Failed to build re-attach path");
@@ -179,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
                        listen_pg_port: m.postgres_port,
                        listen_http_addr: m.http_host,
                        listen_http_port: m.http_port,
-                        listen_https_port: None, // TODO: Support https.
+                        listen_https_port: m.https_port,
                        availability_zone_id: az_id.expect("Checked above"),
                    })
                }
@@ -205,7 +207,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
            register: register.clone(),
        };

-        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
+        let response: ReAttachResponse = self.retry_http_forever(&url, request).await?;
        tracing::info!(
            "Received re-attach response with {} tenants (node {}, register: {:?})",
            response.tenants.len(),
@@ -223,11 +225,12 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
    }

    /// Block until we get a successful response, or error out if we are shut down
+    #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
    async fn validate(
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
    ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
-        let re_attach_path = self
+        let url = self
            .base_url
            .join("validate")
            .expect("Failed to build validate path");
@@ -257,8 +260,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
                return Err(RetryForeverError::ShuttingDown);
            }

-            let response: ValidateResponse =
-                self.retry_http_forever(&re_attach_path, request).await?;
+            let response: ValidateResponse = self.retry_http_forever(&url, request).await?;
            for rt in response.tenants {
                result.insert(rt.id, rt.valid);
            }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -842,6 +842,12 @@ paths:
        required: false
        schema:
          type: integer
+      - name: recurse
+        description: When set, will recurse with the downloads into ancestor timelines
+        in: query
+        required: false
+        schema:
+          type: boolean
    post:
      description: |
        Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -37,7 +37,8 @@ use pageserver_api::models::{
    TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest,
    TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode,
    TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo,
-    TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
+    TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem,
+    TopTenantShardsRequest, TopTenantShardsResponse,
 };
 use pageserver_api::shard::{ShardCount, TenantShardId};
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
@@ -54,6 +55,7 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use crate::config::PageServerConf;
+use crate::context;
 use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::pgdatadir_mapping::LsnForTimestamp;
@@ -63,6 +65,7 @@ use crate::tenant::mgr::{
    GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError,
    TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError,
 };
+use crate::tenant::remote_timeline_client::index::GcCompactionState;
 use crate::tenant::remote_timeline_client::{
    download_index_part, list_remote_tenant_shards, list_remote_timelines,
 };
@@ -481,6 +484,7 @@ async fn build_timeline_info_common(

        state,
        is_archived: Some(is_archived),
+        rel_size_migration: Some(timeline.get_rel_size_v2_status()),

        walreceiver_status,
    };
@@ -857,6 +861,75 @@ async fn timeline_archival_config_handler(
    json_response(StatusCode::OK, ())
 }

+/// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency
+/// measure only.
+///
+/// Some examples of safe patches:
+/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case of a bug that didn't bump the cutoff and causes read errors.
+/// - Force set the index part to use reldir v2 (migrating/migrated).
+///
+/// Some examples of unsafe patches:
+/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause
+///   errors.
+/// - Decrease the gc_cutoff without validating the data really exists. It will cause read errors in the background.
+///
+/// Every field of the request body is optional and is applied independently, in the order
+/// below. Requires global permission. Responds with `200 OK` and an empty JSON body on success.
+async fn timeline_patch_index_part_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    // NB: the body is parsed before the permission check below.
+    let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?;
+    check_permission(&request, None)?; // require global permission for this request
+    let state = get_state(&request);
+
+    async {
+        let timeline =
+            active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+                .await?;
+
+        // A failure in any step returns early; steps that already ran are not rolled back here.
+        if let Some(rel_size_migration) = request_data.rel_size_migration {
+            timeline
+                .update_rel_size_v2_status(rel_size_migration)
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(gc_compaction_last_completed_lsn) =
+            request_data.gc_compaction_last_completed_lsn
+        {
+            timeline
+                .update_gc_compaction_state(GcCompactionState {
+                    last_completed_lsn: gc_compaction_last_completed_lsn,
+                })
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        // This branch only stores the new value into `timeline.applied_gc_cutoff_lsn`;
+        // nothing in this branch schedules an index upload (see `force_index_update` below).
+        if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn {
+            {
+                let guard = timeline.applied_gc_cutoff_lsn.lock_for_write();
+                guard.store_and_unlock(applied_gc_cutoff_lsn);
+            }
+        }
+
+        // Optionally schedule an index upload through the remote client.
+        if request_data.force_index_update {
+            timeline
+                .remote_client
+                .force_schedule_index_upload()
+                .context("force schedule index upload")
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        Ok::<_, ApiError>(())
+    }
+    .instrument(info_span!("timeline_patch_index_part",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                %timeline_id))
+    .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_detail_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -881,12 +954,13 @@ async fn timeline_detail_handler(
        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

        let timeline = tenant.get_timeline(timeline_id, false)?;
+        let ctx = &ctx.with_scope_timeline(&timeline);

        let timeline_info = build_timeline_info(
            &timeline,
            include_non_incremental_logical_size.unwrap_or(false),
            force_await_initial_logical_size.unwrap_or(false),
-            &ctx,
+            ctx,
        )
        .await
        .context("get local timeline info")
@@ -927,11 +1001,11 @@ async fn get_lsn_by_timestamp_handler(

    let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false);

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
    let result = timeline
        .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
        .await?;
@@ -1000,10 +1074,11 @@ async fn get_timestamp_of_lsn_handler(
        .with_context(|| format!("Invalid LSN: {lsn_str:?}"))
        .map_err(ApiError::BadRequest)?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
    let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;

    match result {
@@ -1358,7 +1433,8 @@ async fn timeline_layer_scan_disposable_keys(
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);

    let guard = timeline.layers.read().await;
    let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
@@ -1368,7 +1444,7 @@ async fn timeline_layer_scan_disposable_keys(
    };

    let resident_layer = layer
-        .download_and_keep_resident()
+        .download_and_keep_resident(&ctx)
        .await
        .map_err(|err| match err {
            tenant::storage_layer::layer::DownloadError::TimelineShutdown
@@ -1436,6 +1512,7 @@ async fn timeline_download_heatmap_layers_handler(

    let desired_concurrency =
        parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY);
+    let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false);

    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

@@ -1443,6 +1520,8 @@ async fn timeline_download_heatmap_layers_handler(
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);

    let max_concurrency = get_config(&request)
        .remote_storage_config
@@ -1451,7 +1530,7 @@ async fn timeline_download_heatmap_layers_handler(
        .unwrap_or(DEFAULT_MAX_CONCURRENCY);
    let concurrency = std::cmp::min(max_concurrency, desired_concurrency);

-    timeline.start_heatmap_layers_download(concurrency).await?;
+    timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?;

    json_response(StatusCode::ACCEPTED, ())
 }
@@ -1490,8 +1569,10 @@ async fn layer_download_handler(
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
    let downloaded = timeline
-        .download_layer(&layer_name)
+        .download_layer(&layer_name, &ctx)
        .await
        .map_err(|e| match e {
            tenant::storage_layer::layer::DownloadError::TimelineShutdown
@@ -2225,8 +2306,8 @@ async fn timeline_compact_handler(
        .unwrap_or(false);

    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
        if scheduled {
            let tenant = state
                .tenant_manager
@@ -2333,8 +2414,8 @@ async fn timeline_checkpoint_handler(
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
        if wait_until_flushed {
            timeline.freeze_and_flush().await
        } else {
@@ -2389,7 +2470,9 @@ async fn timeline_download_remote_layers_handler_post(
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
-    match timeline.spawn_download_all_remote_layers(body).await {
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
+    match timeline.spawn_download_all_remote_layers(body, &ctx).await {
        Ok(st) => json_response(StatusCode::ACCEPTED, st),
        Err(st) => json_response(StatusCode::CONFLICT, st),
    }
@@ -2471,6 +2554,7 @@ async fn timeline_detach_ancestor_handler(
        tracing::info!("all timeline upload queues are drained");

        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let ctx = &ctx.with_scope_timeline(&timeline);

        let progress = timeline
            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
@@ -2577,8 +2661,9 @@ async fn getpage_at_lsn_handler_inner(
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        // Enable read path debugging
-        let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build();
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
+        let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true)
+        .scope(context::Scope::new_timeline(&timeline)).build();

        // Use last_record_lsn if no lsn is provided
        let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
@@ -2612,8 +2697,8 @@ async fn timeline_collect_keyspace(
    let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;

    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
        let (dense_ks, sparse_ks) = timeline
            .collect_keyspace(at_lsn, &ctx)
@@ -3250,7 +3335,7 @@ async fn put_tenant_timeline_import_basebackup(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        let timeline = tenant
+        let (timeline, timeline_ctx) = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .map_err(ApiError::InternalServerError)
            .await?;
@@ -3269,7 +3354,13 @@ async fn put_tenant_timeline_import_basebackup(
        info!("importing basebackup");

        timeline
-            .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
+            .import_basebackup_from_tar(
+                tenant.clone(),
+                &mut body,
+                base_lsn,
+                broker_client,
+                &timeline_ctx,
+            )
            .await
            .map_err(ApiError::InternalServerError)?;

@@ -3309,6 +3400,7 @@ async fn put_tenant_timeline_import_wal(
        let state = get_state(&request);

        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
+        let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build();

        let mut body = StreamReader::new(request.into_body().map(|res| {
            res.map_err(|error| {
@@ -3625,6 +3717,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
            |r| api_handler(r, get_timestamp_of_lsn_handler),
        )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part",
+            |r| api_handler(r, timeline_patch_index_part_handler),
+        )
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
            |r| api_handler(r, lsn_lease_handler),
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -64,6 +64,7 @@ pub struct CancellableTask {
    pub cancel: CancellationToken,
 }
 pub struct HttpEndpointListener(pub CancellableTask);
+/// Wraps the [`CancellableTask`] of the HTTPS endpoint listener; the HTTPS counterpart of
+/// [`HttpEndpointListener`].
+pub struct HttpsEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
 impl CancellableTask {
@@ -77,6 +78,7 @@ impl CancellableTask {
 #[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
    http_listener: HttpEndpointListener,
+    https_listener: Option<HttpsEndpointListener>,
    page_service: page_service::Listener,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
@@ -213,6 +215,15 @@ pub async fn shutdown_pageserver(
    )
    .await;

+    if let Some(https_listener) = https_listener {
+        timed(
+            https_listener.0.shutdown(),
+            "shutdown https",
+            Duration::from_secs(1),
+        )
+        .await;
+    }
+
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -143,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_layers_per_read_batch_global",
+        "Layers visited to serve a single read batch (read amplification), regardless of number of reads.",
+        vec![ // bucket list handed to the histogram: powers of two, 1 through 1024
+            1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
+        ],
+    )
+    .expect("failed to define a metric") // panics if the metric cannot be registered
+});
+
+pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_layers_per_read_amortized_global",
+        "Layers visited to serve a single read (read amplification). Amortized across a batch: \
+            all visited layers are divided by number of reads.",
+        vec![ // bucket list handed to the histogram: powers of two, 1 through 1024
+            1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
+        ],
+    )
+    .expect("failed to define a metric") // panics if the metric cannot be registered
+});
+
 pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
    // We expect this to be low because of Postgres checkpoints. Let's see if that holds.
    register_histogram!(
@@ -1204,11 +1227,24 @@ impl StorageIoTime {

 pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);

-const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
+#[derive(Clone, Copy)] // Copy is needed because `as_str` casts `*self` by value
+#[repr(usize)] // discriminants are cast to usize in `as_str` to index `VARIANTS`
+enum StorageIoSizeOperation {
+    Read, // discriminant 0: pairs with VARIANTS[0] ("read")
+    Write, // discriminant 1: pairs with VARIANTS[1] ("write")
+}
+
+impl StorageIoSizeOperation {
+    const VARIANTS: &'static [&'static str] = &["read", "write"]; // must stay in variant declaration order
+
+    fn as_str(&self) -> &'static str { // string form of the variant, taken from VARIANTS
+        Self::VARIANTS[*self as usize] // indexed by discriminant; a variant without an entry would panic here
+    }
+}

 // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
-pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
+static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
        &["operation", "tenant_id", "shard_id", "timeline_id"]
@@ -1216,6 +1252,34 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+#[derive(Clone, Debug)]
+pub(crate) struct StorageIoSizeMetrics { // the two STORAGE_IO_SIZE gauges of one timeline, looked up once in `new`
+    pub read: UIntGauge, // gauge fetched with the "read" operation label
+    pub write: UIntGauge, // gauge fetched with the "write" operation label
+}
+
+impl StorageIoSizeMetrics {
+    pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self { // resolves both gauges from STORAGE_IO_SIZE
+        let read = STORAGE_IO_SIZE
+            .get_metric_with_label_values(&[ // four values: operation first, then the three ids passed in
+                StorageIoSizeOperation::Read.as_str(),
+                tenant_id,
+                shard_id,
+                timeline_id,
+            ])
+            .unwrap(); // panics if the lookup returns Err
+        let write = STORAGE_IO_SIZE
+            .get_metric_with_label_values(&[ // same labels as above, with the "write" operation
+                StorageIoSizeOperation::Write.as_str(),
+                tenant_id,
+                shard_id,
+                timeline_id,
+            ])
+            .unwrap(); // panics if the lookup returns Err
+        Self { read, write }
+    }
+}
+
 #[cfg(not(test))]
 pub(crate) mod virtual_file_descriptor_cache {
    use super::*;
@@ -2798,6 +2862,7 @@ pub(crate) struct TimelineMetrics {
    /// Number of valid LSN leases.
    pub valid_lsn_lease_count_gauge: UIntGauge,
    pub wal_records_received: IntCounter,
+    pub storage_io_size: StorageIoSizeMetrics,
    shutdown: std::sync::atomic::AtomicBool,
 }

@@ -2933,6 +2998,8 @@ impl TimelineMetrics {
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

+        let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
+
        TimelineMetrics {
            tenant_id,
            shard_id,
@@ -2962,6 +3029,7 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
+            storage_io_size,
            valid_lsn_lease_count_gauge,
            wal_records_received,
            shutdown: std::sync::atomic::AtomicBool::default(),
@@ -3152,7 +3220,7 @@ impl TimelineMetrics {
            ]);
        }

-        for op in STORAGE_IO_SIZE_OPERATIONS {
+        for op in StorageIoSizeOperation::VARIANTS {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
        }

@@ -4074,6 +4142,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
    // histograms
    [
        &LAYERS_PER_READ_GLOBAL,
+        &LAYERS_PER_READ_BATCH_GLOBAL,
+        &LAYERS_PER_READ_AMORTIZED_GLOBAL,
        &DELTAS_PER_READ_GLOBAL,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -56,6 +56,7 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::{
    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
+    TimelineMetrics,
 };
 use crate::pgdatadir_mapping::Version;
 use crate::span::{
@@ -392,10 +393,6 @@ impl TimelineHandles {
            .await
            .map_err(|e| match e {
                timeline::handle::GetError::TenantManager(e) => e,
-                timeline::handle::GetError::TimelineGateClosed => {
-                    trace!("timeline gate closed");
-                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
-                }
                timeline::handle::GetError::PerTimelineStateShutDown => {
                    trace!("per-timeline state shut down");
                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
@@ -422,24 +419,36 @@ pub(crate) struct TenantManagerTypes;
 impl timeline::handle::Types for TenantManagerTypes {
    type TenantManagerError = GetActiveTimelineError;
    type TenantManager = TenantManagerWrapper;
-    type Timeline = Arc<Timeline>;
+    type Timeline = TenantManagerCacheItem;
 }

-impl timeline::handle::ArcTimeline<TenantManagerTypes> for Arc<Timeline> {
-    fn gate(&self) -> &utils::sync::gate::Gate {
-        &self.gate
-    }
+pub(crate) struct TenantManagerCacheItem { // used as `Timeline` in the `timeline::handle::Types` impl for TenantManagerTypes
+    pub(crate) timeline: Arc<Timeline>, // the value handed out by this type's Deref impl
+    // allow() for cheap propagation through RequestContext inside a task
+    #[allow(clippy::redundant_allocation)]
+    pub(crate) metrics: Arc<Arc<TimelineMetrics>>, // outer Arc wraps a clone of the timeline's own Arc<TimelineMetrics>
+    #[allow(dead_code)] // we store it to keep the gate open
+    pub(crate) gate_guard: GateGuard,
+}

+impl std::ops::Deref for TenantManagerCacheItem { // lets a cache item be used where `&Arc<Timeline>` is expected
+    type Target = Arc<Timeline>;
+    fn deref(&self) -> &Self::Target {
+        &self.timeline // borrow only; the metrics and gate guard are not reachable through Deref
+    }
+}
+
+impl timeline::handle::Timeline<TenantManagerTypes> for TenantManagerCacheItem {
    fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId {
-        Timeline::shard_timeline_id(self)
+        Timeline::shard_timeline_id(&self.timeline)
    }

    fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState<TenantManagerTypes> {
-        &self.handles
+        &self.timeline.handles
    }

    fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity {
-        Timeline::get_shard_identity(self)
+        Timeline::get_shard_identity(&self.timeline)
    }
 }

@@ -448,7 +457,7 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
        &self,
        timeline_id: TimelineId,
        shard_selector: ShardSelector,
-    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+    ) -> Result<TenantManagerCacheItem, GetActiveTimelineError> {
        let tenant_id = self.tenant_id.get().expect("we set this in get()");
        let timeout = ACTIVE_TENANT_TIMEOUT;
        let wait_start = Instant::now();
@@ -491,7 +500,23 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
        let timeline = tenant_shard
            .get_timeline(timeline_id, true)
            .map_err(GetActiveTimelineError::Timeline)?;
-        Ok(timeline)
+
+        let gate_guard = match timeline.gate.enter() {
+            Ok(guard) => guard,
+            Err(_) => {
+                return Err(GetActiveTimelineError::Timeline(
+                    GetTimelineError::ShuttingDown,
+                ));
+            }
+        };
+
+        let metrics = Arc::new(Arc::clone(&timeline.metrics));
+
+        Ok(TenantManagerCacheItem {
+            timeline,
+            metrics,
+            gate_guard,
+        })
    }
 }

@@ -1220,6 +1245,14 @@ impl PageServerHandler {
        ),
        QueryError,
    > {
+        macro_rules! upgrade_handle_and_set_context { // upgrades a weak shard handle and derives a scoped context from it
+            ($shard:ident) => {{
+                let weak_handle = &$shard;
+                let handle = weak_handle.upgrade()?; // `?` returns early from the enclosing function on failure
+                let ctx = ctx.with_scope_page_service_pagestream(&handle); // right-hand `ctx` is the enclosing function's binding, not a macro argument
+                (handle, ctx) // callers destructure this as `let (shard, ctx) = ...`
+            }};
+        }
        Ok(match batch {
            BatchedFeMessage::Exists {
                span,
@@ -1228,9 +1261,10 @@ impl PageServerHandler {
                req,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::exists");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    vec![
-                        self.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx)
+                        self.handle_get_rel_exists_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
                            .map(|msg| (msg, timer))
@@ -1246,9 +1280,10 @@ impl PageServerHandler {
                req,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::nblocks");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    vec![
-                        self.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx)
+                        self.handle_get_nblocks_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
                            .map(|msg| (msg, timer))
@@ -1264,17 +1299,18 @@ impl PageServerHandler {
                pages,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::getpage");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    {
                        let npages = pages.len();
                        trace!(npages, "handling getpage request");
                        let res = self
                            .handle_get_page_at_lsn_request_batched(
-                                &*shard.upgrade()?,
+                                &shard,
                                effective_request_lsn,
                                pages,
                                io_concurrency,
-                                ctx,
+                                &ctx,
                            )
                            .instrument(span.clone())
                            .await;
@@ -1291,9 +1327,10 @@ impl PageServerHandler {
                req,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::dbsize");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    vec![
-                        self.handle_db_size_request(&*shard.upgrade()?, &req, ctx)
+                        self.handle_db_size_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
                            .map(|msg| (msg, timer))
@@ -1309,9 +1346,10 @@ impl PageServerHandler {
                req,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    vec![
-                        self.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx)
+                        self.handle_get_slru_segment_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
                            .map(|msg| (msg, timer))
@@ -1327,12 +1365,13 @@ impl PageServerHandler {
                requests,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::test");
+                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                (
                    {
                        let npages = requests.len();
                        trace!(npages, "handling getpage request");
                        let res = self
-                            .handle_test_request_batch(&*shard.upgrade()?, requests, ctx)
+                            .handle_test_request_batch(&shard, requests, &ctx)
                            .instrument(span.clone())
                            .await;
                        assert_eq!(res.len(), npages);
@@ -2095,6 +2134,7 @@ impl PageServerHandler {
                // TODO: passthrough the error site to the final error message?
                BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
                BasebackupError::Server(e) => QueryError::Other(e),
+                BasebackupError::Shutdown => QueryError::Shutdown,
            }
        }

@@ -2107,6 +2147,7 @@ impl PageServerHandler {
            .get(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
        set_tracing_field_shard_id(&timeline);
+        let ctx = ctx.with_scope_timeline(&timeline);

        if timeline.is_archived() == Some(true) {
            tracing::info!(
@@ -2124,7 +2165,7 @@ impl PageServerHandler {
                    lsn,
                    crate::tenant::timeline::WaitLsnWaiter::PageService,
                    crate::tenant::timeline::WaitLsnTimeout::Default,
-                    ctx,
+                    &ctx,
                )
                .await?;
            timeline
@@ -2150,7 +2191,7 @@ impl PageServerHandler {
                prev_lsn,
                full_backup,
                replica,
-                ctx,
+                &ctx,
            )
            .await
            .map_err(map_basebackup_error)?;
@@ -2173,7 +2214,7 @@ impl PageServerHandler {
                    prev_lsn,
                    full_backup,
                    replica,
-                    ctx,
+                    &ctx,
                )
                .await
                .map_err(map_basebackup_error)?;
@@ -2190,7 +2231,7 @@ impl PageServerHandler {
                    prev_lsn,
                    full_backup,
                    replica,
-                    ctx,
+                    &ctx,
                )
                .await
                .map_err(map_basebackup_error)?;
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -21,6 +21,7 @@ use pageserver_api::key::{
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
 };
 use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::models::RelSizeMigration;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
@@ -492,7 +493,9 @@ impl Timeline {
        // Otherwise, read the old reldir keyspace.
        // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.

-        if self.get_rel_size_v2_enabled() {
+        if let RelSizeMigration::Migrated | RelSizeMigration::Migrating =
+            self.get_rel_size_v2_status()
+        {
            // fetch directory listing (new)
            let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
            let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
@@ -544,7 +547,7 @@ impl Timeline {
                forknum: *forknum,
            }));

-        if !self.get_rel_size_v2_enabled() {
+        if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() {
            return Ok(rels_v1);
        }

@@ -599,28 +602,36 @@ impl Timeline {
        let n_blocks = self
            .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
            .await?;
-        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
-        for blkno in 0..n_blocks {
-            let block = self
-                .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
-                .await?;
-            segment.extend_from_slice(&block[..BLCKSZ as usize]);
-        }
-        Ok(segment.freeze())
-    }

-    /// Look up given SLRU page version.
-    pub(crate) async fn get_slru_page_at_lsn(
-        &self,
-        kind: SlruKind,
-        segno: u32,
-        blknum: BlockNumber,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<Bytes, PageReconstructError> {
-        assert!(self.tenant_shard_id.is_shard_zero());
-        let key = slru_block_to_key(kind, segno, blknum);
-        self.get(key, lsn, ctx).await
+        let keyspace = KeySpace::single(
+            slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks),
+        );
+
+        let batches = keyspace.partition(
+            self.get_shard_identity(),
+            Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+        );
+
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            self.conf,
+            self.gate
+                .enter()
+                .map_err(|_| PageReconstructError::Cancelled)?,
+        );
+
+        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
+        for batch in batches.parts {
+            let blocks = self
+                .get_vectored(batch, lsn, io_concurrency.clone(), ctx)
+                .await?;
+
+            for (_key, block) in blocks {
+                let block = block?;
+                segment.extend_from_slice(&block[..BLCKSZ as usize]);
+            }
+        }
+
+        Ok(segment.freeze())
    }

    /// Get size of an SLRU segment
@@ -829,19 +840,41 @@ impl Timeline {
            let nblocks = self
                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
                .await?;
-            for blknum in (0..nblocks).rev() {
-                let clog_page = self
-                    .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
+
+            let keyspace = KeySpace::single(
+                slru_block_to_key(SlruKind::Clog, segno, 0)
+                    ..slru_block_to_key(SlruKind::Clog, segno, nblocks),
+            );
+
+            let batches = keyspace.partition(
+                self.get_shard_identity(),
+                Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+            );
+
+            let io_concurrency = IoConcurrency::spawn_from_conf(
+                self.conf,
+                self.gate
+                    .enter()
+                    .map_err(|_| PageReconstructError::Cancelled)?,
+            );
+
+            for batch in batches.parts.into_iter().rev() {
+                let blocks = self
+                    .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
                    .await?;

-                if clog_page.len() == BLCKSZ as usize + 8 {
-                    let mut timestamp_bytes = [0u8; 8];
-                    timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
-                    let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
+                for (_key, clog_page) in blocks.into_iter().rev() {
+                    let clog_page = clog_page?;

-                    match f(timestamp) {
-                        ControlFlow::Break(b) => return Ok(b),
-                        ControlFlow::Continue(()) => (),
+                    if clog_page.len() == BLCKSZ as usize + 8 {
+                        let mut timestamp_bytes = [0u8; 8];
+                        timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
+                        let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
+
+                        match f(timestamp) {
+                            ControlFlow::Break(b) => return Ok(b),
+                            ControlFlow::Continue(()) => (),
+                        }
                    }
                }
            }
@@ -1052,6 +1085,8 @@ impl Timeline {
    ) -> Result<u64, CalculateLogicalSizeError> {
        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();

+        fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) });
+
        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
        let dbdir = DbDirectory::des(&buf)?;
@@ -1718,6 +1753,35 @@ impl DatadirModification<'_> {
        Ok(())
    }

+    /// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that
+    /// we enable it, we also need to persist it in `index_part.json`.
+    pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result<bool> {
+        let status = self.tline.get_rel_size_v2_status(); // one of Legacy / Migrating / Migrated, as matched below
+        let config = self.tline.get_rel_size_v2_enabled(); // bool flag; the arms below treat it as the tenant config setting
+        match (config, status) { // only the (true, Legacy) arm has a side effect
+            (false, RelSizeMigration::Legacy) => {
+                // tenant config didn't enable it and we didn't write any reldir_v2 key yet
+                Ok(false)
+            }
+            (false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
+                // index_part already persisted that the timeline has enabled rel_size_v2
+                Ok(true) // the status takes precedence over the disabled config flag
+            }
+            (true, RelSizeMigration::Legacy) => {
+                // The first time we enable it, we need to persist it in `index_part.json`
+                self.tline
+                    .update_rel_size_v2_status(RelSizeMigration::Migrating)?; // failure is returned to the caller via `?`
+                tracing::info!("enabled rel_size_v2");
+                Ok(true)
+            }
+            (true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
+                // index_part already persisted that the timeline has enabled rel_size_v2
+                // and we don't need to do anything
+                Ok(true)
+            }
+        }
+    }
+
    /// Store a relmapper file (pg_filenode.map) in the repository
    pub async fn put_relmap_file(
        &mut self,
@@ -1726,6 +1790,8 @@ impl DatadirModification<'_> {
        img: Bytes,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+
        // Add it to the directory (if it doesn't exist already)
        let buf = self.get(DBDIR_KEY, ctx).await?;
        let mut dbdir = DbDirectory::des(&buf)?;
@@ -1746,7 +1812,7 @@ impl DatadirModification<'_> {
            })?;
            self.pending_directory_entries
                .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
-            if self.tline.get_rel_size_v2_enabled() {
+            if v2_enabled {
                self.pending_directory_entries
                    .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
            }
@@ -1898,12 +1964,12 @@ impl DatadirModification<'_> {
                .context("deserialize db")?
        };

-        // Add the new relation to the rel directory entry, and write it back
-        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-            return Err(RelationError::AlreadyExists);
-        }
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;

-        if self.tline.get_rel_size_v2_enabled() {
+        if v2_enabled {
+            if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
+                return Err(RelationError::AlreadyExists);
+            }
            let sparse_rel_dir_key =
                rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
            // check if the rel_dir_key exists in v2
@@ -1938,6 +2004,10 @@ impl DatadirModification<'_> {
            self.pending_directory_entries
                .push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
        } else {
+            // Add the new relation to the rel directory entry, and write it back
+            if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
+                return Err(RelationError::AlreadyExists);
+            }
            if !dbdir_exists {
                self.pending_directory_entries
                    .push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
@@ -1951,6 +2021,7 @@ impl DatadirModification<'_> {
                )),
            );
        }
+
        // Put size
        let size_key = rel_size_to_key(rel);
        let buf = nblocks.to_le_bytes();
@@ -2029,6 +2100,7 @@ impl DatadirModification<'_> {
        drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;
        for ((spc_node, db_node), rel_tags) in drop_relations {
            let dir_key = rel_dir_to_key(spc_node, db_node);
            let buf = self.get(dir_key, ctx).await?;
@@ -2041,7 +2113,7 @@ impl DatadirModification<'_> {
                        .push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
                    dirty = true;
                    true
-                } else if self.tline.get_rel_size_v2_enabled() {
+                } else if v2_enabled {
                    // The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
                    // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
                    // logic).
@@ -2072,7 +2144,7 @@ impl DatadirModification<'_> {
                    // Remove entry from relation size cache
                    self.tline.remove_cached_rel_size(&rel_tag);

-                    // Delete size entry, as well as all blocks
+                    // Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage.
                    self.delete(rel_key_range(rel_tag));
                }
            }
@@ -2686,7 +2758,7 @@ mod tests {
            TimelineId::from_array(hex!("11223344556677881122334455667788"));

        let (tenant, ctx) = harness.load().await;
-        let tline = tenant
+        let (tline, ctx) = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let tline = tline.raw_timeline().unwrap();
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -31,8 +31,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use itertools::Itertools as _;
 use once_cell::sync::Lazy;
-use pageserver_api::models;
 pub use pageserver_api::models::TenantState;
+use pageserver_api::models::{self, RelSizeMigration};
 use pageserver_api::models::{
    CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
    WalRedoManagerStatus,
@@ -77,6 +77,8 @@ use self::timeline::{
    EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
 };
 use crate::config::PageServerConf;
+use crate::context;
+use crate::context::RequestContextBuilder;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::l0_flush::L0FlushGlobalState;
@@ -1114,7 +1116,7 @@ impl Tenant {
            }
        };

-        let timeline = self.create_timeline_struct(
+        let (timeline, timeline_ctx) = self.create_timeline_struct(
            timeline_id,
            &metadata,
            previous_heatmap,
@@ -1123,6 +1125,8 @@ impl Tenant {
            CreateTimelineCause::Load,
            idempotency.clone(),
            index_part.gc_compaction.clone(),
+            index_part.rel_size_migration.clone(),
+            ctx,
        )?;
        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
        anyhow::ensure!(
@@ -1149,16 +1153,19 @@ impl Tenant {
        // a previous heatmap which contains all visible layers in the layer map.
        // This previous heatmap will be used whenever a fresh heatmap is generated
        // for the timeline.
-        if matches!(cause, LoadTimelineCause::Unoffload) {
+        if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) {
            let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn()));
            while let Some((tline, end_lsn)) = tline_ending_at {
                let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await;
-                if !tline.is_previous_heatmap_active() {
+                // Another unarchived timeline might have generated a heatmap for this ancestor.
+                // If the current branch point is greater than the previous one, use the heatmap
+                // we just generated - it should include more layers.
+                if !tline.should_keep_previous_heatmap(end_lsn) {
                    tline
                        .previous_heatmap
                        .store(Some(Arc::new(unarchival_heatmap)));
                } else {
-                    tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.")
+                    tracing::info!("Previous heatmap preferred. Dropping unarchival heatmap.")
                }

                match tline.ancestor_timeline() {
@@ -1253,7 +1260,7 @@ impl Tenant {
                        match activate {
                            ActivateTimelineArgs::Yes { broker_client } => {
                                info!("activating timeline after reload from pgdata import task");
-                                timeline.activate(self.clone(), broker_client, None, ctx);
+                                timeline.activate(self.clone(), broker_client, None, &timeline_ctx);
                            }
                            ActivateTimelineArgs::No => (),
                        }
@@ -1578,6 +1585,10 @@ impl Tenant {
    }

    async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
+        if !self.conf.load_previous_heatmap {
+            return None;
+        }
+
        let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
        match tokio::fs::read_to_string(on_disk_heatmap_path).await {
            Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
@@ -1757,6 +1768,7 @@ impl Tenant {
                        import_pgdata,
                        ActivateTimelineArgs::No,
                        guard,
+                        ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
                    ));
                }
            }
@@ -1774,6 +1786,7 @@ impl Tenant {
                timeline_id,
                &index_part.metadata,
                remote_timeline_client,
+                ctx,
            )
            .instrument(tracing::info_span!("timeline_delete", %timeline_id))
            .await
@@ -1939,6 +1952,7 @@ impl Tenant {
                hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
                    heatmap: h,
                    read_at: hs.1,
+                    end_lsn: None,
                })
            });
            part_downloads.spawn(
@@ -2210,7 +2224,7 @@ impl Tenant {
                self.clone(),
                broker_client.clone(),
                background_jobs_can_start,
-                &ctx,
+                &ctx.with_scope_timeline(&timeline),
            );
        }

@@ -2407,8 +2421,8 @@ impl Tenant {
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
        pg_version: u32,
-        _ctx: &RequestContext,
-    ) -> anyhow::Result<UninitializedTimeline> {
+        ctx: &RequestContext,
+    ) -> anyhow::Result<(UninitializedTimeline, RequestContext)> {
        anyhow::ensure!(
            self.is_active(),
            "Cannot create empty timelines on inactive tenant"
@@ -2442,6 +2456,8 @@ impl Tenant {
            create_guard,
            initdb_lsn,
            None,
+            None,
+            ctx,
        )
        .await
    }
@@ -2459,7 +2475,7 @@ impl Tenant {
        pg_version: u32,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let uninit_tl = self
+        let (uninit_tl, ctx) = self
            .create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
            .await?;
        let tline = uninit_tl.raw_timeline().expect("we just created it");
@@ -2471,7 +2487,7 @@ impl Tenant {
            .init_empty_test_timeline()
            .context("init_empty_test_timeline")?;
        modification
-            .commit(ctx)
+            .commit(&ctx)
            .await
            .context("commit init_empty_test_timeline modification")?;

@@ -2497,6 +2513,7 @@ impl Tenant {
        initdb_lsn: Lsn,
        pg_version: u32,
        ctx: &RequestContext,
+        in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
        end_lsn: Lsn,
@@ -2518,6 +2535,11 @@ impl Tenant {
                .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
                .await?;
        }
+        for in_memory in in_memory_layer_desc {
+            tline
+                .force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx)
+                .await?;
+        }
        let layer_names = tline
            .layers
            .read()
@@ -2683,7 +2705,12 @@ impl Tenant {
        // doing stuff before the IndexPart is durable in S3, which is done by the previous section.
        let activated_timeline = match result {
            CreateTimelineResult::Created(timeline) => {
-                timeline.activate(self.clone(), broker_client, None, ctx);
+                timeline.activate(
+                    self.clone(),
+                    broker_client,
+                    None,
+                    &ctx.with_scope_timeline(&timeline),
+                );
                timeline
            }
            CreateTimelineResult::Idempotent(timeline) => {
@@ -2745,10 +2772,9 @@ impl Tenant {
            }
        };

-        let mut uninit_timeline = {
+        let (mut uninit_timeline, timeline_ctx) = {
            let this = &self;
            let initdb_lsn = Lsn(0);
-            let _ctx = ctx;
            async move {
                let new_metadata = TimelineMetadata::new(
                    // Initialize disk_consistent LSN to 0, The caller must import some data to
@@ -2767,6 +2793,8 @@ impl Tenant {
                    timeline_create_guard,
                    initdb_lsn,
                    None,
+                    None,
+                    ctx,
                )
                .await
            }
@@ -2796,6 +2824,7 @@ impl Tenant {
            index_part,
            activate,
            timeline_create_guard,
+            timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
        ));

        // NB: the timeline doesn't exist in self.timelines at this point
@@ -2809,6 +2838,7 @@ impl Tenant {
        index_part: import_pgdata::index_part_format::Root,
        activate: ActivateTimelineArgs,
        timeline_create_guard: TimelineCreateGuard,
+        ctx: RequestContext,
    ) {
        debug_assert_current_span_has_tenant_and_timeline_id();
        info!("starting");
@@ -2820,6 +2850,7 @@ impl Tenant {
                index_part,
                activate,
                timeline_create_guard,
+                ctx,
            )
            .await;
        if let Err(err) = &res {
@@ -2835,9 +2866,8 @@ impl Tenant {
        index_part: import_pgdata::index_part_format::Root,
        activate: ActivateTimelineArgs,
        timeline_create_guard: TimelineCreateGuard,
+        ctx: RequestContext,
    ) -> Result<(), anyhow::Error> {
-        let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn);
-
        info!("importing pgdata");
        import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
            .await
@@ -3046,6 +3076,7 @@ impl Tenant {

            let mut has_pending_l0 = false;
            for timeline in compact_l0 {
+                let ctx = &ctx.with_scope_timeline(&timeline);
                let outcome = timeline
                    .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
                    .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
@@ -3079,6 +3110,7 @@ impl Tenant {
            if !timeline.is_active() {
                continue;
            }
+            let ctx = &ctx.with_scope_timeline(&timeline);

            let mut outcome = timeline
                .compact(cancel, EnumSet::default(), ctx)
@@ -3141,11 +3173,13 @@ impl Tenant {
    /// Trips the compaction circuit breaker if appropriate.
    pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
        match err {
+            err if err.is_cancel() => {}
            CompactionError::ShuttingDown => (),
            // Offload failures don't trip the circuit breaker, since they're cheap to retry and
            // shouldn't block compaction.
            CompactionError::Offload(_) => {}
            CompactionError::CollectKeySpaceError(err) => {
+                // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in the `err.is_cancel` branch.
                self.compaction_circuit_breaker
                    .lock()
                    .unwrap()
@@ -3302,7 +3336,7 @@ impl Tenant {
                    self.clone(),
                    broker_client.clone(),
                    background_jobs_can_start,
-                    ctx,
+                    &ctx.with_scope_timeline(timeline),
                );
                activated_timelines += 1;
            }
@@ -4116,7 +4150,9 @@ impl Tenant {
        cause: CreateTimelineCause,
        create_idempotency: CreateTimelineIdempotency,
        gc_compaction_state: Option<GcCompactionState>,
-    ) -> anyhow::Result<Arc<Timeline>> {
+        rel_size_v2_status: Option<RelSizeMigration>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<(Arc<Timeline>, RequestContext)> {
        let state = match cause {
            CreateTimelineCause::Load => {
                let ancestor_id = new_metadata.ancestor_timeline();
@@ -4148,10 +4184,15 @@ impl Tenant {
            self.attach_wal_lag_cooldown.clone(),
            create_idempotency,
            gc_compaction_state,
+            rel_size_v2_status,
            self.cancel.child_token(),
        );

-        Ok(timeline)
+        let timeline_ctx = RequestContextBuilder::extend(ctx)
+            .scope(context::Scope::new_timeline(&timeline))
+            .build();
+
+        Ok((timeline, timeline_ctx))
    }

    /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
@@ -4567,6 +4608,7 @@ impl Tenant {
        // Ensures all timelines use the same start time when computing the time cutoff.
        let now_ts_for_pitr_calc = SystemTime::now();
        for timeline in timelines.iter() {
+            let ctx = &ctx.with_scope_timeline(timeline);
            let cutoff = timeline
                .get_last_record_lsn()
                .checked_sub(horizon)
@@ -4740,7 +4782,7 @@ impl Tenant {
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
        start_lsn: Option<Lsn>,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> Result<CreateTimelineResult, CreateTimelineError> {
        let src_id = src_timeline.timeline_id;

@@ -4843,13 +4885,15 @@ impl Tenant {
            src_timeline.pg_version,
        );

-        let uninitialized_timeline = self
+        let (uninitialized_timeline, _timeline_ctx) = self
            .prepare_new_timeline(
                dst_id,
                &metadata,
                timeline_create_guard,
                start_lsn + 1,
                Some(Arc::clone(src_timeline)),
+                Some(src_timeline.get_rel_size_v2_status()),
+                ctx,
            )
            .await?;

@@ -5116,13 +5160,15 @@ impl Tenant {
            pgdata_lsn,
            pg_version,
        );
-        let mut raw_timeline = self
+        let (mut raw_timeline, timeline_ctx) = self
            .prepare_new_timeline(
                timeline_id,
                &new_metadata,
                timeline_create_guard,
                pgdata_lsn,
                None,
+                None,
+                ctx,
            )
            .await?;

@@ -5133,7 +5179,7 @@ impl Tenant {
                    &unfinished_timeline,
                    &pgdata_path,
                    pgdata_lsn,
-                    ctx,
+                    &timeline_ctx,
                )
                .await
                .with_context(|| {
@@ -5194,6 +5240,7 @@ impl Tenant {
    /// An empty layer map is initialized, and new data and WAL can be imported starting
    /// at 'disk_consistent_lsn'. After any initial data has been imported, call
    /// `finish_creation` to insert the Timeline into the timelines map.
+    #[allow(clippy::too_many_arguments)]
    async fn prepare_new_timeline<'a>(
        &'a self,
        new_timeline_id: TimelineId,
@@ -5201,15 +5248,17 @@ impl Tenant {
        create_guard: TimelineCreateGuard,
        start_lsn: Lsn,
        ancestor: Option<Arc<Timeline>>,
-    ) -> anyhow::Result<UninitializedTimeline<'a>> {
+        rel_size_v2_status: Option<RelSizeMigration>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<(UninitializedTimeline<'a>, RequestContext)> {
        let tenant_shard_id = self.tenant_shard_id;

        let resources = self.build_timeline_resources(new_timeline_id);
        resources
            .remote_client
-            .init_upload_queue_for_empty_remote(new_metadata)?;
+            .init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?;

-        let timeline_struct = self
+        let (timeline_struct, timeline_ctx) = self
            .create_timeline_struct(
                new_timeline_id,
                new_metadata,
@@ -5219,6 +5268,8 @@ impl Tenant {
                CreateTimelineCause::Load,
                create_guard.idempotency.clone(),
                None,
+                rel_size_v2_status,
+                ctx,
            )
            .context("Failed to create timeline data structure")?;

@@ -5239,10 +5290,13 @@ impl Tenant {
            "Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}"
        );

-        Ok(UninitializedTimeline::new(
-            self,
-            new_timeline_id,
-            Some((timeline_struct, create_guard)),
+        Ok((
+            UninitializedTimeline::new(
+                self,
+                new_timeline_id,
+                Some((timeline_struct, create_guard)),
+            ),
+            timeline_ctx,
        ))
    }

@@ -5777,7 +5831,8 @@ pub(crate) mod harness {
        }

        pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
-            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
+                .with_scope_unit_test();
            (
                self.do_try_load(&ctx)
                    .await
@@ -5907,6 +5962,8 @@ mod tests {
    #[cfg(feature = "testing")]
    use timeline::GcInfo;
    #[cfg(feature = "testing")]
+    use timeline::InMemoryLayerTestDesc;
+    #[cfg(feature = "testing")]
    use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
    use timeline::{CompactOptions, DeltaLayerTestDesc};
    use utils::id::TenantId;
@@ -6798,7 +6855,7 @@ mod tests {

        let (tenant, ctx) = harness.load().await;
        let io_concurrency = IoConcurrency::spawn_for_test();
-        let tline = tenant
+        let (tline, ctx) = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let tline = tline.raw_timeline().unwrap();
@@ -7420,7 +7477,7 @@ mod tests {
            .await;

        let initdb_lsn = Lsn(0x20);
-        let utline = tenant
+        let (utline, ctx) = tenant
            .create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
            .await?;
        let tline = utline.raw_timeline().unwrap();
@@ -7487,7 +7544,7 @@ mod tests {
        let harness = TenantHarness::create(name).await?;
        {
            let (tenant, ctx) = harness.load().await;
-            let tline = tenant
+            let (tline, _ctx) = tenant
                .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
                .await?;
            // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
@@ -7919,6 +7976,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                Vec::new(), // delta layers
                vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
@@ -8006,6 +8064,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                Vec::new(), // delta layers
                vec![(
                    Lsn(0x20),
@@ -8221,6 +8280,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                // delta layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8301,6 +8361,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                // delta layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8374,6 +8435,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                // delta layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8506,6 +8568,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -8699,6 +8762,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
                    Lsn(0x10)..Lsn(0x40),
                    delta1,
@@ -8755,6 +8819,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                Vec::new(),
                image_layers,
                end_lsn,
@@ -8961,6 +9026,7 @@ mod tests {
                    Lsn(0x08),
                    DEFAULT_PG_VERSION,
                    &ctx,
+                    Vec::new(), // in-memory layers
                    vec![
                        DeltaLayerTestDesc::new_with_inferred_key_range(
                            Lsn(0x08)..Lsn(0x10),
@@ -8979,7 +9045,7 @@ mod tests {
                            delta3,
                        ),
                    ], // delta layers
-                    vec![], // image layers
+                    vec![],     // image layers
                    Lsn(0x50),
                )
                .await?
@@ -8990,6 +9056,7 @@ mod tests {
                    Lsn(0x10),
                    DEFAULT_PG_VERSION,
                    &ctx,
+                    Vec::new(), // in-memory layers
                    vec![
                        DeltaLayerTestDesc::new_with_inferred_key_range(
                            Lsn(0x10)..Lsn(0x48),
@@ -9540,6 +9607,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
@@ -9787,6 +9855,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![
                    // delta1 and delta 2 only contain a single key but multiple updates
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
@@ -10022,6 +10091,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![],                       // in-memory layers
                vec![],                       // delta layers
                vec![(Lsn(0x18), img_layer)], // image layers
                Lsn(0x18),
@@ -10268,6 +10338,7 @@ mod tests {
                baseline_image_layer_lsn,
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
                    delta_layer_start_lsn..delta_layer_end_lsn,
                    delta_layer_spec,
@@ -10299,6 +10370,158 @@ mod tests {
        Ok(())
    }

+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> { // Vectored read where an image layer's LSN lies inside a frozen in-memory layer's LSN range.
+        let harness =
+            TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        let will_init_keys = [2, 6]; // key ids whose frozen-layer record is a `wal_init`, which resets the expected value
+        fn get_key(id: u32) -> Key { // fixed key prefix; only `field6` varies with `id`
+            let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let mut expected_key_values = HashMap::new(); // key -> string the final read is expected to return
+
+        let baseline_image_layer_lsn = Lsn(0x10);
+        let mut baseline_img_layer = Vec::new();
+        for i in 0..5 { // baseline image holds key ids 0..5
+            let key = get_key(i);
+            let value = format!("value {i}@{baseline_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none()); // no earlier expectation existed for this key
+
+            baseline_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let nested_image_layer_lsn = Lsn(0x50); // asserted below to lie strictly inside the frozen layer's LSN range
+        let mut nested_img_layer = Vec::new();
+        for i in 5..10 { // nested image holds key ids 5..10
+            let key = get_key(i);
+            let value = format!("value {i}@{nested_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none()); // no earlier expectation existed for this key
+
+            nested_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let frozen_layer = { // non-open in-memory layer (`is_open: false`) carrying one record per key
+            let lsn_range = Lsn(0x40)..Lsn(0x60);
+            let mut data = Vec::new();
+            for i in 0..10 {
+                let key = get_key(i);
+                let key_in_nested = nested_img_layer
+                    .iter()
+                    .any(|(key_with_img, _)| *key_with_img == key);
+                let lsn = {
+                    if key_in_nested { // record goes 5 above the nested image's LSN
+                        Lsn(nested_image_layer_lsn.0 + 5)
+                    } else { // otherwise the record sits at the start of the layer's range
+                        lsn_range.start
+                    }
+                };
+
+                let will_init = will_init_keys.contains(&i);
+                if will_init {
+                    data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
+
+                    expected_key_values.insert(key, "".to_string()); // overwrites the image-based expectation with ""
+                } else {
+                    let delta = format!("@{lsn}");
+                    data.push((
+                        key,
+                        lsn,
+                        Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                    ));
+
+                    expected_key_values
+                        .get_mut(&key)
+                        .expect("An image exists for each key")
+                        .push_str(delta.as_str()); // expected value grows by the appended delta
+                }
+            }
+
+            InMemoryLayerTestDesc {
+                lsn_range,
+                is_open: false,
+                data,
+            }
+        };
+
+        let (open_layer, last_record_lsn) = { // open in-memory layer plus the highest LSN written into it
+            let start_lsn = Lsn(0x70);
+            let mut data = Vec::new();
+            let mut end_lsn = Lsn(0); // running maximum of the record LSNs
+            for i in 0..10 {
+                let key = get_key(i);
+                let lsn = Lsn(start_lsn.0 + i as u64); // a distinct LSN per key id
+                let delta = format!("@{lsn}");
+                data.push((
+                    key,
+                    lsn,
+                    Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                ));
+
+                expected_key_values
+                    .get_mut(&key)
+                    .expect("An image exists for each key")
+                    .push_str(delta.as_str()); // every key, including the will-init ones, gets this append
+
+                end_lsn = std::cmp::max(end_lsn, lsn);
+            }
+
+            (
+                InMemoryLayerTestDesc {
+                    lsn_range: start_lsn..Lsn::MAX,
+                    is_open: true,
+                    data,
+                },
+                end_lsn,
+            )
+        };
+
+        assert!( // precondition of the scenario: the nested image LSN is strictly inside the frozen layer's range
+            nested_image_layer_lsn > frozen_layer.lsn_range.start
+                && nested_image_layer_lsn < frozen_layer.lsn_range.end
+        );
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                baseline_image_layer_lsn,
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![open_layer, frozen_layer], // in-memory layers
+                Vec::new(),                     // delta layers
+                vec![
+                    (baseline_image_layer_lsn, baseline_img_layer),
+                    (nested_image_layer_lsn, nested_img_layer),
+                ], // image layers
+                last_record_lsn,
+            )
+            .await?;
+
+        let keyspace = KeySpace::single(get_key(0)..get_key(10)); // all ten keys built above
+        let results = tline
+            .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
+            .await
+            .expect("No vectored errors");
+        for (key, res) in results { // NOTE(review): leftovers in `expected_key_values` are never checked after this loop; confirm `get_vectored` itself reports keys it could not return
+            let value = res.expect("No key errors");
+            let expected_value = expected_key_values.remove(&key).expect("No unknown keys"); // `remove` also makes a key returned twice fail here
+            assert_eq!(value, Bytes::from(expected_value.clone()));
+
+            tracing::info!("key={key} value={expected_value}");
+        }
+
+        Ok(())
+    }
+
    fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
        (
            k1.is_delta,
@@ -10414,6 +10637,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -10798,6 +11022,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![
                    // delta1/2/4 only contain a single key but multiple updates
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
@@ -11049,6 +11274,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![
                    // delta1/2/4 only contain a single key but multiple updates
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -382,7 +382,8 @@ pub(crate) mod tests {
    }

    async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let (_temp_dir, pathbuf, offsets) =
            write_maybe_compressed(blobs, compression, &ctx).await?;

--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -32,8 +32,7 @@ use hex;
 use thiserror::Error;
 use tracing::error;

-use crate::context::{DownloadBehavior, RequestContext};
-use crate::task_mgr::TaskKind;
+use crate::context::RequestContext;
 use crate::tenant::block_io::{BlockReader, BlockWriter};
 use crate::virtual_file::{IoBuffer, IoBufferMut, owned_buffers_io::write::Buffer};

@@ -478,16 +477,15 @@ where
    }

    #[allow(dead_code)]
-    pub async fn dump(&self) -> Result<()> {
+    pub async fn dump(&self, ctx: &RequestContext) -> Result<()> {
        let mut stack = Vec::new();
-        let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

        stack.push((self.root_blk, String::new(), 0, 0, 0));

        let block_cursor = self.reader.block_cursor();

        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?;
+            let blk = block_cursor.read_blk(self.start_blk + blknum, ctx).await?;
            let buf: &[u8] = blk.as_ref();
            let node = OnDiskNode::<L>::deparse(buf)?;

@@ -836,6 +834,8 @@ pub(crate) mod tests {
    use rand::Rng;

    use super::*;
+    use crate::context::DownloadBehavior;
+    use crate::task_mgr::TaskKind;
    use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};

    #[derive(Clone, Default)]
@@ -870,7 +870,8 @@ pub(crate) mod tests {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();

        let all_keys: Vec<&[u8; 6]> = vec![
            b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb",
@@ -888,7 +889,7 @@ pub(crate) mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump(&ctx).await?;

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
@@ -980,7 +981,8 @@ pub(crate) mod tests {
    async fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();

        const NUM_KEYS: u64 = 1000;

@@ -998,7 +1000,7 @@ pub(crate) mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump(&ctx).await?;

        use std::sync::Mutex;

@@ -1168,7 +1170,8 @@ pub(crate) mod tests {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();

        for (key, val) in disk_btree_test_data::TEST_DATA {
            writer.append(&key, val)?;
@@ -1199,7 +1202,7 @@ pub(crate) mod tests {
            .await?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

-        reader.dump().await?;
+        reader.dump(&ctx).await?;

        Ok(())
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -352,7 +352,8 @@ mod tests {
        let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
        fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;

-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();

        Ok((conf, tenant_shard_id, timeline_id, ctx))
    }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage<Value> {
    /// The latest state
    head: LayerCoverageTuple<Value>,

+    /// TODO: this could be an ordered vec using binary search.
+    /// We push into this map every time we add a layer, so we might see some benefit
    /// All previous states
    historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
 }
@@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage<Value> {
    buffer: BTreeMap<LayerKey, Option<Value>>,

    /// All current layers. This is not used for search. Only to make rebuilds easier.
+    // TODO: This map is never cleared. Rebuilds could use the post-trim last entry of
+    // [`Self::historic_coverage`] instead of doubling memory usage.
+    // [`Self::len`]: can require rebuild and serve from latest historic
+    // [`Self::iter`]: already requires rebuild => can serve from latest historic
    layers: BTreeMap<LayerKey, Value>,
 }

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -194,7 +194,7 @@ pub(crate) use download::{
 };
 use index::GcCompactionState;
 pub(crate) use index::LayerFileMetadata;
-use pageserver_api::models::TimelineArchivalState;
+use pageserver_api::models::{RelSizeMigration, TimelineArchivalState};
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use regex::Regex;
 use remote_storage::{
@@ -437,9 +437,13 @@ impl RemoteTimelineClient {

    /// Initialize the upload queue for the case where the remote storage is empty,
    /// i.e., it doesn't have an `IndexPart`.
+    ///
+    /// `rel_size_v2_status` needs to be carried over during branching, and that's why
+    /// it's passed in here.
    pub fn init_upload_queue_for_empty_remote(
        &self,
        local_metadata: &TimelineMetadata,
+        rel_size_v2_status: Option<RelSizeMigration>,
    ) -> anyhow::Result<()> {
        // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
        // certainly no point in starting more upload tasks than this.
@@ -449,7 +453,9 @@ impl RemoteTimelineClient {
            .as_ref()
            .map_or(0, |r| r.concurrency_limit());
        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
+        let initialized_queue =
+            upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
+        initialized_queue.dirty.rel_size_migration = rel_size_v2_status;
        self.update_remote_physical_size_gauge(None);
        info!("initialized upload queue as empty");
        Ok(())
@@ -900,7 +906,7 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Launch an index-file upload operation in the background, setting `import_pgdata` field.
+    /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
    pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
        self: &Arc<Self>,
        gc_compaction_state: GcCompactionState,
@@ -912,6 +918,21 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Set `rel_size_migration` in the in-memory index part; no upload is scheduled here, so the next index upload carries it.
+    pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
+        self: &Arc<Self>,
+        rel_size_v2_status: RelSizeMigration,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
+        // TODO: allow this operation to bypass the validation check because we might upload the index part
+        // with no layers but the flag updated. For now, we just modify the index part in memory and the next
+        // upload will include the flag.
+        // self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
+
    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///
@@ -933,6 +954,14 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Only used in the `patch_index_part` HTTP API to force trigger an index upload.
+    pub fn force_schedule_index_upload(self: &Arc<Self>) -> Result<(), NotInitialized> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
+
    /// Launch an index-file upload operation in the background (internal function)
    fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
        let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -7,6 +7,7 @@ use std::collections::HashMap;

 use chrono::NaiveDateTime;
 use pageserver_api::models::AuxFilePolicy;
+use pageserver_api::models::RelSizeMigration;
 use pageserver_api::shard::ShardIndex;
 use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;
@@ -117,21 +118,6 @@ pub struct GcCompactionState {
    pub(crate) last_completed_lsn: Lsn,
 }

-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-pub enum RelSizeMigration {
-    /// The tenant is using the old rel_size format.
-    /// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
-    /// `None` is the same as `Some(RelSizeMigration::Legacy)`.
-    Legacy,
-    /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
-    /// persisted in the index part. The read path will read both formats and merge them.
-    Migrating,
-    /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
-    /// in the index part, and the read path will not read the old format.
-    Migrated,
-}
-
 impl IndexPart {
    /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
    /// used to understand later versions.
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -491,7 +491,10 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
        let remote_storage = self.remote_storage.clone();
        let conf = self.tenant_manager.get_conf();
        let tenant_shard_id = *secondary_state.get_tenant_shard_id();
-        let download_ctx = self.root_ctx.attached_child();
+        let download_ctx = self
+            .root_ctx
+            .attached_child()
+            .with_scope_secondary_tenant(&tenant_shard_id);
        (RunningDownload { barrier }, Box::pin(async move {
            let _completion = completion;

@@ -771,6 +774,7 @@ impl<'a> TenantDownloader<'a> {

        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
+            let ctx = &ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline.timeline_id);
            let timeline_state = timeline_states
                .remove(&timeline.timeline_id)
                .expect("Just populated above");
@@ -869,8 +873,7 @@ impl<'a> TenantDownloader<'a> {
                let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();

                let layers_in_heatmap = heatmap_timeline
-                    .layers
-                    .iter()
+                    .hot_layers()
                    .map(|l| (&l.name, l.metadata.generation))
                    .collect::<HashSet<_>>();
                let layers_on_disk = timeline_state
@@ -1015,7 +1018,8 @@ impl<'a> TenantDownloader<'a> {
        // Accumulate updates to the state
        let mut touched = Vec::new();

-        for layer in timeline.layers {
+        let timeline_id = timeline.timeline_id;
+        for layer in timeline.into_hot_layers() {
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!("Cancelled -- dropping out of layer loop");
                return (Err(UpdateError::Cancelled), touched);
@@ -1040,7 +1044,7 @@ impl<'a> TenantDownloader<'a> {
            }

            match self
-                .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
+                .download_layer(tenant_shard_id, &timeline_id, layer, ctx)
                .await
            {
                Ok(Some(layer)) => touched.push(layer),
@@ -1148,7 +1152,7 @@ impl<'a> TenantDownloader<'a> {
        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
        let timeline_id = timeline.timeline_id;

-        tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
+        tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());

        let (result, touched) = self
            .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
@@ -1316,11 +1320,11 @@ async fn init_timeline_state(
    // As we iterate through layers found on disk, we will look up their metadata from this map.
    // Layers not present in metadata will be discarded.
    let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
-        heatmap.layers.iter().map(|l| (&l.name, l)).collect();
+        heatmap.hot_layers().map(|l| (&l.name, l)).collect();

    let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
        if let Some(last_heatmap) = last_heatmap {
-            last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
+            last_heatmap.hot_layers().map(|l| (&l.name, l)).collect()
        } else {
            HashMap::new()
        };
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline {
    #[serde_as(as = "DisplayFromStr")]
    pub(crate) timeline_id: TimelineId,

-    pub(crate) layers: Vec<HeatMapLayer>,
+    layers: Vec<HeatMapLayer>,
 }

 #[serde_as]
@@ -53,8 +53,10 @@ pub(crate) struct HeatMapLayer {

    #[serde_as(as = "TimestampSeconds<i64>")]
    pub(crate) access_time: SystemTime,
-    // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
-    // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
+
+    #[serde(default)]
+    pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
+                           // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
 }

 impl HeatMapLayer {
@@ -62,11 +64,13 @@ impl HeatMapLayer {
        name: LayerName,
        metadata: LayerFileMetadata,
        access_time: SystemTime,
+        cold: bool,
    ) -> Self {
        Self {
            name,
            metadata,
            access_time,
+            cold,
        }
    }
 }
@@ -78,6 +82,18 @@ impl HeatMapTimeline {
            layers,
        }
    }
+
+    pub(crate) fn into_hot_layers(self) -> impl Iterator<Item = HeatMapLayer> {
+        self.layers.into_iter().filter(|l| !l.cold)
+    }
+
+    pub(crate) fn hot_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
+        self.layers.iter().filter(|l| !l.cold)
+    }
+
+    pub(crate) fn all_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
+        self.layers.iter()
+    }
 }

 pub(crate) struct HeatMapStats {
@@ -92,7 +108,7 @@ impl HeatMapTenant {
            layers: 0,
        };
        for timeline in &self.timelines {
-            for layer in &timeline.layers {
+            for layer in timeline.hot_layers() {
                stats.layers += 1;
                stats.bytes += layer.metadata.file_size;
            }
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -474,7 +474,7 @@ async fn fill_logical_sizes(
            if cached_size.is_none() {
                let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
                let parallel_size_calcs = Arc::clone(limit);
-                let ctx = ctx.attached_child();
+                let ctx = ctx.attached_child().with_scope_timeline(&timeline);
                joinset.spawn(
                    calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
                        .in_current_span(),
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard;

 use self::inmemory_layer::InMemoryLayerFileId;
 use super::PageReconstructError;
+use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
 use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
@@ -721,6 +722,12 @@ struct LayerToVisitId {
    lsn_floor: Lsn,
 }

+#[derive(Debug, PartialEq, Eq, Hash)]
+pub enum ReadableLayerWeak {
+    PersistentLayer(Arc<PersistentLayerDesc>),
+    InMemoryLayer(InMemoryLayerDesc),
+}
+
 /// Layer wrapper for the read path. Note that it is valid
 /// to use these layers even after external operations have
 /// been performed on them (compaction, freeze, etc.).
@@ -873,7 +880,7 @@ impl ReadableLayer {
            }
            ReadableLayer::InMemoryLayer(layer) => {
                layer
-                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
                    .await
            }
        }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1385,7 +1385,7 @@ impl DeltaLayerInner {
            block_reader,
        );

-        tree_reader.dump().await?;
+        tree_reader.dump(ctx).await?;

        let keys = self.index_entries(ctx).await?;

@@ -2024,6 +2024,7 @@ pub(crate) mod test {
            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
            .await
            .unwrap();
+        let ctx = &ctx.with_scope_timeline(&timeline);

        let initdb_layer = timeline
            .layers
@@ -2136,7 +2137,7 @@ pub(crate) mod test {
            .await
            .unwrap();

-            let new_layer = new_layer.download_and_keep_resident().await.unwrap();
+            let new_layer = new_layer.download_and_keep_resident(ctx).await.unwrap();

            new_layer
                .copy_delta_prefix(&mut writer, truncate_at, ctx)
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -208,7 +208,7 @@ impl ImageLayerInner {
            block_reader,
        );

-        tree_reader.dump().await?;
+        tree_reader.dump(ctx).await?;

        tree_reader
            .visit(
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -416,7 +416,7 @@ impl InMemoryLayer {
    pub(crate) async fn get_values_reconstruct_data(
        self: &Arc<InMemoryLayer>,
        keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
@@ -433,8 +433,6 @@ impl InMemoryLayer {
        let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
        let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();

-        let lsn_range = self.start_lsn..end_lsn;
-
        for range in keyspace.ranges.iter() {
            for (key, vec_map) in inner
                .index
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -324,16 +324,16 @@ impl Layer {
        reconstruct_data: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        let downloaded = self
-            .0
-            .get_or_maybe_download(true, Some(ctx))
-            .await
-            .map_err(|err| match err {
-                DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
-                    GetVectoredError::Cancelled
-                }
-                other => GetVectoredError::Other(anyhow::anyhow!(other)),
-            })?;
+        let downloaded =
+            self.0
+                .get_or_maybe_download(true, ctx)
+                .await
+                .map_err(|err| match err {
+                    DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
+                        GetVectoredError::Cancelled
+                    }
+                    other => GetVectoredError::Other(anyhow::anyhow!(other)),
+                })?;
        let this = ResidentLayer {
            downloaded: downloaded.clone(),
            owner: self.clone(),
@@ -356,8 +356,8 @@ impl Layer {
    /// Download the layer if evicted.
    ///
    /// Will not error when the layer is already downloaded.
-    pub(crate) async fn download(&self) -> Result<(), DownloadError> {
-        self.0.get_or_maybe_download(true, None).await?;
+    pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
+        self.0.get_or_maybe_download(true, ctx).await?;
        Ok(())
    }

@@ -392,8 +392,11 @@ impl Layer {
    }

    /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
-    pub(crate) async fn download_and_keep_resident(&self) -> Result<ResidentLayer, DownloadError> {
-        let downloaded = self.0.get_or_maybe_download(true, None).await?;
+    pub(crate) async fn download_and_keep_resident(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<ResidentLayer, DownloadError> {
+        let downloaded = self.0.get_or_maybe_download(true, ctx).await?;

        Ok(ResidentLayer {
            downloaded,
@@ -446,7 +449,7 @@ impl Layer {

        if verbose {
            // for now, unconditionally download everything, even if that might not be wanted.
-            let l = self.0.get_or_maybe_download(true, Some(ctx)).await?;
+            let l = self.0.get_or_maybe_download(true, ctx).await?;
            l.dump(&self.0, ctx).await?
        }

@@ -945,7 +948,7 @@ impl LayerInner {
    async fn get_or_maybe_download(
        self: &Arc<Self>,
        allow_download: bool,
-        ctx: Option<&RequestContext>,
+        ctx: &RequestContext,
    ) -> Result<Arc<DownloadedLayer>, DownloadError> {
        let (weak, permit) = {
            // get_or_init_detached can:
@@ -1035,21 +1038,14 @@ impl LayerInner {
            return Err(DownloadError::NotFile(ft));
        }

-        if let Some(ctx) = ctx {
-            self.check_expected_download(ctx)?;
-        }
+        self.check_expected_download(ctx)?;

        if !allow_download {
            // this is only used from tests, but it is hard to test without the boolean
            return Err(DownloadError::DownloadRequired);
        }

-        let download_ctx = ctx
-            .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download))
-            .unwrap_or(RequestContext::new(
-                TaskKind::LayerDownload,
-                DownloadBehavior::Download,
-            ));
+        let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download);

        async move {
            tracing::info!(%reason, "downloading on-demand");
@@ -1567,10 +1563,10 @@ impl LayerInner {

        self.access_stats.record_residence_event();

-        self.status.as_ref().unwrap().send_replace(Status::Evicted);
-
        *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());

+        self.status.as_ref().unwrap().send_replace(Status::Evicted);
+
        Ok(())
    }

--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -8,7 +8,6 @@ use utils::id::TimelineId;
 use super::failpoints::{Failpoint, FailpointKind};
 use super::*;
 use crate::context::DownloadBehavior;
-use crate::task_mgr::TaskKind;
 use crate::tenant::harness::{TenantHarness, test_img};
 use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint};

@@ -27,11 +26,9 @@ async fn smoke_test() {
    let h = TenantHarness::create("smoke_test").await.unwrap();
    let span = h.span();
    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
-    let (tenant, _) = h.load().await;
+    let (tenant, ctx) = h.load().await;
    let io_concurrency = IoConcurrency::spawn_for_test();

-    let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
-
    let image_layers = vec![(
        Lsn(0x40),
        vec![(
@@ -49,12 +46,14 @@ async fn smoke_test() {
            Lsn(0x10),
            14,
            &ctx,
+            Default::default(), // in-memory layers
            Default::default(),
            image_layers,
            Lsn(0x100),
        )
        .await
        .unwrap();
+    let ctx = &ctx.with_scope_timeline(&timeline);

    // Grab one of the timeline's layers to exercise in the test, and the other layer that is just
    // there to avoid the timeline being illegally empty
@@ -93,7 +92,7 @@ async fn smoke_test() {
                controlfile_keyspace.clone(),
                Lsn(0x10)..Lsn(0x11),
                &mut data,
-                &ctx,
+                ctx,
            )
            .await
            .unwrap();
@@ -128,7 +127,7 @@ async fn smoke_test() {
                controlfile_keyspace.clone(),
                Lsn(0x10)..Lsn(0x11),
                &mut data,
-                &ctx,
+                ctx,
            )
            .instrument(download_span.clone())
            .await
@@ -178,7 +177,7 @@ async fn smoke_test() {

    // plain downloading is rarely needed
    layer
-        .download_and_keep_resident()
+        .download_and_keep_resident(ctx)
        .instrument(download_span)
        .await
        .unwrap();
@@ -340,6 +339,7 @@ fn read_wins_pending_eviction() {
            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
            .await
            .unwrap();
+        let ctx = ctx.with_scope_timeline(&timeline);

        let layer = {
            let mut layers = {
@@ -379,7 +379,7 @@ fn read_wins_pending_eviction() {
        // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
        layer
            .0
-            .get_or_maybe_download(false, None)
+            .get_or_maybe_download(false, &ctx)
            .instrument(download_span)
            .await
            .expect("should had reinitialized without downloading");
@@ -472,6 +472,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
            .await
            .unwrap();
+        let ctx = ctx.with_scope_timeline(&timeline);

        let layer = {
            let mut layers = {
@@ -514,7 +515,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
        // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
        layer
            .0
-            .get_or_maybe_download(false, None)
+            .get_or_maybe_download(false, &ctx)
            .instrument(download_span)
            .await
            .expect("should had reinitialized without downloading");
@@ -641,7 +642,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
        .await
        .unwrap();
+    let ctx = ctx.with_scope_timeline(&timeline);

+    // This test does downloads
+    let ctx = RequestContextBuilder::extend(&ctx)
+        .download_behavior(DownloadBehavior::Download)
+        .build();
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
@@ -674,7 +680,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    // simulate a cancelled read which is cancelled before it gets to re-initialize
    let e = layer
        .0
-        .get_or_maybe_download(false, None)
+        .get_or_maybe_download(false, &ctx)
        .await
        .unwrap_err();
    assert!(
@@ -698,7 +704,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    // failpoint is still enabled, but it is not hit
    let e = layer
        .0
-        .get_or_maybe_download(false, None)
+        .get_or_maybe_download(false, &ctx)
        .await
        .unwrap_err();
    assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
@@ -721,6 +727,12 @@ async fn evict_and_wait_does_not_wait_for_download() {
        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
        .await
        .unwrap();
+    let ctx = ctx.with_scope_timeline(&timeline);
+
+    // This test does downloads
+    let ctx = RequestContextBuilder::extend(&ctx)
+        .download_behavior(DownloadBehavior::Download)
+        .build();

    let layer = {
        let mut layers = {
@@ -768,7 +780,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
    let mut download = std::pin::pin!(
        layer
            .0
-            .get_or_maybe_download(true, None)
+            .get_or_maybe_download(true, &ctx)
            .instrument(download_span)
    );

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -289,15 +289,14 @@ fn log_compaction_error(
 ) {
    use CompactionError::*;

-    use crate::pgdatadir_mapping::CollectKeySpaceError;
    use crate::tenant::PageReconstructError;
    use crate::tenant::upload_queue::NotInitialized;

    let level = match err {
+        e if e.is_cancel() => return,
        ShuttingDown => return,
        Offload(_) => Level::ERROR,
        AlreadyRunning(_) => Level::ERROR,
-        CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO,
        CollectKeySpaceError(_) => Level::ERROR,
        _ if task_cancelled => Level::INFO,
        Other(err) => {
@@ -474,21 +473,15 @@ async fn wait_for_active_tenant(
    }

    let mut update_rx = tenant.subscribe_for_state_updates();
-    loop {
-        tokio::select! {
-            _ = cancel.cancelled() => return ControlFlow::Break(()),
-            result = update_rx.changed() => if result.is_err() {
+    tokio::select! {
+        result = update_rx.wait_for(|s| s == &TenantState::Active) => {
+            if result.is_err() {
                return ControlFlow::Break(());
            }
-        }
-
-        match &*update_rx.borrow() {
-            TenantState::Active => {
-                debug!("Tenant state changed to active, continuing the task loop");
-                return ControlFlow::Continue(());
-            }
-            state => debug!("Not running the task loop, tenant is not active: {state:?}"),
-        }
+            debug!("Tenant state changed to active, continuing the task loop");
+            ControlFlow::Continue(())
+        },
+        _ = cancel.cancelled() => ControlFlow::Break(()),
    }
 }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart
 use pageserver_api::models::{
    CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings,
    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
-    InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState,
+    InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState,
 };
 use pageserver_api::reltag::{BlockNumber, RelTag};
 use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
@@ -99,7 +99,8 @@ use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate,
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::l0_flush::{self, L0FlushGlobalState};
 use crate::metrics::{
-    DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
+    DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL,
+    LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
 };
 use crate::page_service::TenantManagerTypes;
 use crate::pgdatadir_mapping::{
@@ -286,7 +287,7 @@ pub struct Timeline {
    // The LSN of gc-compaction that was last applied to this timeline.
    gc_compaction_state: ArcSwap<Option<GcCompactionState>>,

-    pub(super) metrics: TimelineMetrics,
+    pub(crate) metrics: Arc<TimelineMetrics>,

    // `Timeline` doesn't write these metrics itself, but it manages the lifetime.  Code
    // in `crate::page_service` writes these metrics.
@@ -436,12 +437,16 @@ pub struct Timeline {
    /// May host a background Tokio task which downloads all the layers from the current
    /// heatmap on demand.
    heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
+
+    pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
 }

 pub(crate) enum PreviousHeatmap {
    Active {
        heatmap: HeatMapTimeline,
        read_at: std::time::Instant,
+        // End LSN covered by the heatmap if known
+        end_lsn: Option<Lsn>,
    },
    Obsolete,
 }
@@ -1326,10 +1331,6 @@ impl Timeline {
        // (this is a requirement, not a bug). Skip updating the metric in these cases
        // to avoid infinite results.
        if !results.is_empty() {
-            // Record the total number of layers visited towards each key in the batch. While some
-            // layers may not intersect with a given read, and the cost of layer visits are
-            // amortized across the batch, each visited layer contributes directly to the observed
-            // latency for every read in the batch, which is what we care about.
            if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
                static LOG_PACER: Lazy<Mutex<RateLimit>> =
                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
@@ -1344,9 +1345,23 @@ impl Timeline {
                });
            }

+            // Records the number of layers visited in a few different ways:
+            //
+            // * LAYERS_PER_READ: all layers count towards every read in the batch, because each
+            //   layer directly affects its observed latency.
+            //
+            // * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch
+            //   layer visits and access cost.
+            //
+            // * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized
+            //   read amplification after batching.
+            let layers_visited = layers_visited as f64;
+            let avg_layers_visited = layers_visited / results.len() as f64;
+            LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited);
            for _ in &results {
-                self.metrics.layers_per_read.observe(layers_visited as f64);
-                LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
+                self.metrics.layers_per_read.observe(layers_visited);
+                LAYERS_PER_READ_GLOBAL.observe(layers_visited);
+                LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited);
            }
        }

@@ -1864,16 +1879,25 @@ impl Timeline {
        };

        // Signal compaction failure to avoid L0 flush stalls when it's broken.
-        match result {
+        match &result {
            Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
-            Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => {
+            Err(e) if e.is_cancel() => {}
+            Err(CompactionError::ShuttingDown) => {
+                // Covered by the `Err(e) if e.is_cancel()` branch.
+            }
+            Err(CompactionError::AlreadyRunning(_)) => {
+                // Covered by the `Err(e) if e.is_cancel()` branch.
+            }
+            Err(CompactionError::Other(_)) => {
+                self.compaction_failed.store(true, AtomicOrdering::Relaxed)
+            }
+            Err(CompactionError::CollectKeySpaceError(_)) => {
+                // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch.
                self.compaction_failed.store(true, AtomicOrdering::Relaxed)
            }
            // Don't change the current value on offload failure or shutdown. We don't want to
            // abruptly stall nor resume L0 flushes in these cases.
            Err(CompactionError::Offload(_)) => {}
-            Err(CompactionError::ShuttingDown) => {}
-            Err(CompactionError::AlreadyRunning(_)) => {}
        };

        result
@@ -2188,6 +2212,7 @@ impl Timeline {
    pub(crate) async fn download_layer(
        &self,
        layer_file_name: &LayerName,
+        ctx: &RequestContext,
    ) -> Result<Option<bool>, super::storage_layer::layer::DownloadError> {
        let Some(layer) = self
            .find_layer(layer_file_name)
@@ -2201,7 +2226,7 @@ impl Timeline {
            return Ok(None);
        };

-        layer.download().await?;
+        layer.download(ctx).await?;

        Ok(Some(true))
    }
@@ -2356,6 +2381,9 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

+    /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path
+    /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is
+    /// possible that the index part persists the state while the config doesn't get persisted.
    pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2364,6 +2392,14 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
    }

+    pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration {
+        self.rel_size_v2_status
+            .load()
+            .as_ref()
+            .map(|s| s.as_ref().clone())
+            .unwrap_or(RelSizeMigration::Legacy)
+    }
+
    fn get_compaction_upper_limit(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2624,6 +2660,7 @@ impl Timeline {
        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
        create_idempotency: crate::tenant::CreateTimelineIdempotency,
        gc_compaction_state: Option<GcCompactionState>,
+        rel_size_v2_status: Option<RelSizeMigration>,
        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2648,14 +2685,14 @@ impl Timeline {
        }

        Arc::new_cyclic(|myself| {
-            let metrics = TimelineMetrics::new(
+            let metrics = Arc::new(TimelineMetrics::new(
                &tenant_shard_id,
                &timeline_id,
                crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
                    "mtime",
                    evictions_low_residence_duration_metric_threshold,
                ),
-            );
+            ));
            let aux_file_metrics = metrics.aux_file_size_gauge.clone();

            let mut result = Timeline {
@@ -2782,6 +2819,8 @@ impl Timeline {
                previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),

                heatmap_layers_downloader: Mutex::new(None),
+
+                rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
            };

            result.repartition_threshold =
@@ -2837,7 +2876,7 @@ impl Timeline {
            "layer flush task",
            async move {
                let _guard = guard;
-                let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
+                let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error).with_scope_timeline(&self_clone);
                self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
                let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
                assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
@@ -2858,6 +2897,16 @@ impl Timeline {
            .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
    }

+    pub(crate) fn update_rel_size_v2_status(
+        &self,
+        rel_size_v2_status: RelSizeMigration,
+    ) -> anyhow::Result<()> {
+        self.rel_size_v2_status
+            .store(Some(Arc::new(rel_size_v2_status.clone())));
+        self.remote_client
+            .schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status)
+    }
+
    pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
        self.gc_compaction_state.load_full().as_ref().clone()
    }
@@ -3560,12 +3609,16 @@ impl Timeline {
        Ok(layer)
    }

-    pub(super) fn is_previous_heatmap_active(&self) -> bool {
-        self.previous_heatmap
-            .load()
-            .as_ref()
-            .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. }))
-            .unwrap_or(false)
+    pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool {
+        let crnt = self.previous_heatmap.load();
+        match crnt.as_deref() {
+            Some(PreviousHeatmap::Active { end_lsn, .. }) => match end_lsn {
+                Some(crnt_end_lsn) => *crnt_end_lsn > new_heatmap_end_lsn,
+                None => true,
+            },
+            Some(PreviousHeatmap::Obsolete) => false,
+            None => false,
+        }
    }

    /// The timeline heatmap is a hint to secondary locations from the primary location,
@@ -3593,26 +3646,26 @@ impl Timeline {
        // heatamp.
        let previous_heatmap = self.previous_heatmap.load();
        let visible_non_resident = match previous_heatmap.as_deref() {
-            Some(PreviousHeatmap::Active { heatmap, read_at }) => {
-                Some(heatmap.layers.iter().filter_map(|hl| {
-                    let desc: PersistentLayerDesc = hl.name.clone().into();
-                    let layer = guard.try_get_from_key(&desc.key())?;
+            Some(PreviousHeatmap::Active {
+                heatmap, read_at, ..
+            }) => Some(heatmap.all_layers().filter_map(|hl| {
+                let desc: PersistentLayerDesc = hl.name.clone().into();
+                let layer = guard.try_get_from_key(&desc.key())?;

-                    if layer.visibility() == LayerVisibilityHint::Covered {
-                        return None;
-                    }
+                if layer.visibility() == LayerVisibilityHint::Covered {
+                    return None;
+                }

-                    if layer.is_likely_resident() {
-                        return None;
-                    }
+                if layer.is_likely_resident() {
+                    return None;
+                }

-                    if layer.last_evicted_at().happened_after(*read_at) {
-                        return None;
-                    }
+                if layer.last_evicted_at().happened_after(*read_at) {
+                    return None;
+                }

-                    Some((desc, hl.metadata.clone(), hl.access_time))
-                }))
-            }
+                Some((desc, hl.metadata.clone(), hl.access_time, hl.cold))
+            })),
            Some(PreviousHeatmap::Obsolete) => None,
            None => None,
        };
@@ -3627,6 +3680,7 @@ impl Timeline {
                        layer.layer_desc().clone(),
                        layer.metadata(),
                        last_activity_ts,
+                        false, // these layers are not cold
                    ))
                }
                LayerVisibilityHint::Covered => {
@@ -3653,12 +3707,14 @@ impl Timeline {
        // Sort layers in order of which to download first.  For a large set of layers to download, we
        // want to prioritize those layers which are most likely to still be in the resident many minutes
        // or hours later:
+        // - Cold layers go last for convenience when a human inspects the heatmap.
        // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
        //   only exist for a few minutes before being compacted into L1s.
        // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
        //   the layer is likely to be covered by an image layer during compaction.
-        layers.sort_by_key(|(desc, _meta, _atime)| {
+        layers.sort_by_key(|(desc, _meta, _atime, cold)| {
            std::cmp::Reverse((
+                *cold,
                !LayerMap::is_l0(&desc.key_range, desc.is_delta),
                desc.lsn_range.end,
            ))
@@ -3666,7 +3722,9 @@ impl Timeline {

        let layers = layers
            .into_iter()
-            .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
+            .map(|(desc, meta, atime, cold)| {
+                HeatMapLayer::new(desc.layer_name(), meta, atime, cold)
+            })
            .collect();

        Some(HeatMapTimeline::new(self.timeline_id, layers))
@@ -3686,6 +3744,7 @@ impl Timeline {
                name: vl.layer_desc().layer_name(),
                metadata: vl.metadata(),
                access_time: now,
+                cold: true,
            };
            heatmap_layers.push(hl);
        }
@@ -3699,6 +3758,7 @@ impl Timeline {
        PreviousHeatmap::Active {
            heatmap,
            read_at: Instant::now(),
+            end_lsn: Some(end_lsn),
        }
    }

@@ -3897,39 +3957,22 @@ impl Timeline {
                let guard = timeline.layers.read().await;
                let layers = guard.layer_map()?;

-                let in_memory_layer = layers.find_in_memory_layer(|l| {
-                    let start_lsn = l.get_lsn_range().start;
-                    cont_lsn > start_lsn
-                });
+                for range in unmapped_keyspace.ranges.iter() {
+                    let results = layers.range_search(range.clone(), cont_lsn);

-                match in_memory_layer {
-                    Some(l) => {
-                        let lsn_range = l.get_lsn_range().start..cont_lsn;
-                        fringe.update(
-                            ReadableLayer::InMemoryLayer(l),
-                            unmapped_keyspace.clone(),
-                            lsn_range,
-                        );
-                    }
-                    None => {
-                        for range in unmapped_keyspace.ranges.iter() {
-                            let results = layers.range_search(range.clone(), cont_lsn);
-
-                            results
-                                .found
-                                .into_iter()
-                                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
-                                    (
-                                        ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
-                                        keyspace_accum.to_keyspace(),
-                                        lsn_floor..cont_lsn,
-                                    )
-                                })
-                                .for_each(|(layer, keyspace, lsn_range)| {
-                                    fringe.update(layer, keyspace, lsn_range)
-                                });
-                        }
-                    }
+                    results
+                        .found
+                        .into_iter()
+                        .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                            (
+                                guard.upgrade(layer),
+                                keyspace_accum.to_keyspace(),
+                                lsn_floor..cont_lsn,
+                            )
+                        })
+                        .for_each(|(layer, keyspace, lsn_range)| {
+                            fringe.update(layer, keyspace, lsn_range)
+                        });
                }

                // It's safe to drop the layer map lock after planning the next round of reads.
@@ -4202,10 +4245,6 @@ impl Timeline {

                // Stall flushes to backpressure if compaction can't keep up. This is propagated up
                // to WAL ingestion by having ephemeral layer rolls wait for flushes.
-                //
-                // NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so
-                // we can end up stalling before compaction even starts. Consider making it more
-                // responsive (e.g. via `watch_level0_deltas`).
                if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() {
                    if l0_count >= stall_threshold {
                        warn!(
@@ -4693,10 +4732,7 @@ impl Timeline {
            ));
        }

-        let (dense_ks, sparse_ks) = self
-            .collect_keyspace(lsn, ctx)
-            .await
-            .map_err(CompactionError::CollectKeySpaceError)?;
+        let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
        let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
        let sparse_partitioning = SparseKeyPartitioning {
            parts: vec![sparse_ks],
@@ -5423,13 +5459,42 @@ pub(crate) enum CompactionError {
    Offload(OffloadError),
    /// Compaction cannot be done right now; page reconstruction and so on.
    #[error("Failed to collect keyspace: {0}")]
-    CollectKeySpaceError(CollectKeySpaceError),
+    CollectKeySpaceError(#[from] CollectKeySpaceError),
    #[error(transparent)]
    Other(anyhow::Error),
    #[error("Compaction already running: {0}")]
    AlreadyRunning(&'static str),
 }

+impl CompactionError {
+    /// Errors that can be ignored: cancellation, shutdown, or a compaction that is already running.
+    pub fn is_cancel(&self) -> bool {
+        matches!(
+            self,
+            Self::ShuttingDown
+                | Self::AlreadyRunning(_)
+                | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
+                | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
+                    PageReconstructError::Cancelled
+                ))
+                | Self::Offload(OffloadError::Cancelled)
+        )
+    }
+
+    /// Critical errors that indicate data corruption.
+    pub fn is_critical(&self) -> bool {
+        matches!(
+            self,
+            Self::CollectKeySpaceError(
+                CollectKeySpaceError::Decode(_)
+                    | CollectKeySpaceError::PageRead(
+                        PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
+                    )
+            )
+        )
+    }
+}
+
 impl From<OffloadError> for CompactionError {
    fn from(e: OffloadError) -> Self {
        match e {
@@ -5439,18 +5504,6 @@ impl From<OffloadError> for CompactionError {
    }
 }

-impl From<CollectKeySpaceError> for CompactionError {
-    fn from(err: CollectKeySpaceError) -> Self {
-        match err {
-            CollectKeySpaceError::Cancelled
-            | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => {
-                CompactionError::ShuttingDown
-            }
-            e => CompactionError::Other(e.into()),
-        }
-    }
-}
-
 impl From<super::upload_queue::NotInitialized> for CompactionError {
    fn from(value: super::upload_queue::NotInitialized) -> Self {
        match value {
@@ -5534,6 +5587,14 @@ pub struct DeltaLayerTestDesc {
    pub data: Vec<(Key, Lsn, Value)>,
 }

+#[cfg(test)]
+#[derive(Clone)]
+pub struct InMemoryLayerTestDesc {
+    pub lsn_range: Range<Lsn>,
+    pub data: Vec<(Key, Lsn, Value)>,
+    pub is_open: bool,
+}
+
 #[cfg(test)]
 impl DeltaLayerTestDesc {
    pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
@@ -6193,6 +6254,7 @@ impl Timeline {
    pub(crate) async fn spawn_download_all_remote_layers(
        self: Arc<Self>,
        request: DownloadRemoteLayersTaskSpawnRequest,
+        ctx: &RequestContext,
    ) -> Result<DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskInfo> {
        use pageserver_api::models::DownloadRemoteLayersTaskState;

@@ -6213,6 +6275,10 @@ impl Timeline {
        }

        let self_clone = Arc::clone(&self);
+        let task_ctx = ctx.detached_child(
+            TaskKind::DownloadAllRemoteLayers,
+            DownloadBehavior::Download,
+        );
        let task_id = task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::DownloadAllRemoteLayers,
@@ -6220,7 +6286,7 @@ impl Timeline {
            Some(self.timeline_id),
            "download all remote layers task",
            async move {
-                self_clone.download_all_remote_layers(request).await;
+                self_clone.download_all_remote_layers(request, &task_ctx).await;
                let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
                 match &mut *status_guard {
                    None => {
@@ -6255,6 +6321,7 @@ impl Timeline {
    async fn download_all_remote_layers(
        self: &Arc<Self>,
        request: DownloadRemoteLayersTaskSpawnRequest,
+        ctx: &RequestContext,
    ) {
        use pageserver_api::models::DownloadRemoteLayersTaskState;

@@ -6311,9 +6378,10 @@ impl Timeline {

                let span = tracing::info_span!("download", layer = %next);

+                let ctx = ctx.attached_child();
                js.spawn(
                    async move {
-                        let res = next.download().await;
+                        let res = next.download(&ctx).await;
                        (next, res)
                    }
                    .instrument(span),
@@ -6541,6 +6609,92 @@ impl Timeline {
        Ok(())
    }

+    /// Force create an in-memory layer and place it into the layer map.
+    #[cfg(test)]
+    pub(super) async fn force_create_in_memory_layer(
+        self: &Arc<Timeline>,
+        mut in_memory: InMemoryLayerTestDesc,
+        check_start_lsn: Option<Lsn>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        use utils::bin_ser::BeSer;
+
+        // Validate LSNs
+        if let Some(check_start_lsn) = check_start_lsn {
+            assert!(in_memory.lsn_range.start >= check_start_lsn);
+        }
+
+        let last_record_lsn = self.get_last_record_lsn();
+        let layer_end_lsn = if in_memory.is_open {
+            in_memory
+                .data
+                .iter()
+                .map(|(_key, lsn, _value)| lsn)
+                .max()
+                .cloned()
+        } else {
+            Some(in_memory.lsn_range.end)
+        };
+
+        if let Some(end) = layer_end_lsn {
+            assert!(
+                end <= last_record_lsn,
+                "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
+                end,
+                last_record_lsn,
+            );
+        }
+
+        in_memory.data.iter().for_each(|(_key, lsn, _value)| {
+            assert!(*lsn >= in_memory.lsn_range.start);
+            assert!(*lsn < in_memory.lsn_range.end);
+        });
+
+        // Build the batch
+        in_memory
+            .data
+            .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
+
+        let data = in_memory
+            .data
+            .into_iter()
+            .map(|(key, lsn, value)| {
+                let value_size = value.serialized_size().unwrap() as usize;
+                (key.to_compact(), lsn, value_size, value)
+            })
+            .collect::<Vec<_>>();
+
+        let batch = SerializedValueBatch::from_values(data);
+
+        // Create the in-memory layer and write the batch into it
+        let layer = InMemoryLayer::create(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            in_memory.lsn_range.start,
+            &self.gate,
+            ctx,
+        )
+        .await
+        .unwrap();
+
+        layer.put_batch(batch, ctx).await.unwrap();
+        if !in_memory.is_open {
+            layer.freeze(in_memory.lsn_range.end).await;
+        }
+
+        info!("force created in-memory layer {:?}", in_memory.lsn_range);
+
+        // Link the layer to the layer map
+        {
+            let mut guard = self.layers.write().await;
+            let layer_map = guard.open_mut().unwrap();
+            layer_map.force_insert_in_memory_layer(Arc::new(layer));
+        }
+
+        Ok(())
+    }
+
    /// Return all keys at the LSN in the image layers
    #[cfg(test)]
    pub(crate) async fn inspect_image_layers(
@@ -6900,11 +7054,13 @@ mod tests {

    use pageserver_api::key::Key;
    use pageserver_api::value::Value;
+    use std::iter::Iterator;
    use tracing::Instrument;
    use utils::id::TimelineId;
    use utils::lsn::Lsn;

    use super::HeatMapTimeline;
+    use crate::context::RequestContextBuilder;
    use crate::tenant::harness::{TenantHarness, test_img};
    use crate::tenant::layer_map::LayerMap;
    use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint};
@@ -6912,8 +7068,8 @@ mod tests {
    use crate::tenant::{PreviousHeatmap, Timeline};

    fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
-        assert_eq!(lhs.layers.len(), rhs.layers.len());
-        let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
+        assert_eq!(lhs.all_layers().count(), rhs.all_layers().count());
+        let lhs_rhs = lhs.all_layers().zip(rhs.all_layers());
        for (l, r) in lhs_rhs {
            assert_eq!(l.name, r.name);
            assert_eq!(l.metadata, r.metadata);
@@ -6972,12 +7128,14 @@ mod tests {
                Lsn(0x10),
                14,
                &ctx,
+                Vec::new(), // in-memory layers
                delta_layers,
                image_layers,
                Lsn(0x100),
            )
            .await
            .unwrap();
+        let ctx = &ctx.with_scope_timeline(&timeline);

        // Layer visibility is an input to heatmap generation, so refresh it first
        timeline.update_layer_visibility().await.unwrap();
@@ -6990,10 +7148,11 @@ mod tests {
        assert_eq!(heatmap.timeline_id, timeline.timeline_id);

        // L0 should come last
-        assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
+        let heatmap_layers = heatmap.all_layers().collect::<Vec<_>>();
+        assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name());

        let mut last_lsn = Lsn::MAX;
-        for layer in &heatmap.layers {
+        for layer in heatmap_layers {
            // Covered layer should be omitted
            assert!(layer.name != covered_delta.layer_name());

@@ -7026,6 +7185,7 @@ mod tests {
            .store(Some(Arc::new(PreviousHeatmap::Active {
                heatmap: heatmap.clone(),
                read_at: std::time::Instant::now(),
+                end_lsn: None,
            })));

        // Generate a new heatmap and assert that it contains the same layers as the old one.
@@ -7041,8 +7201,12 @@ mod tests {

            eprintln!("Downloading {layer} and re-generating heatmap");

+            let ctx = &RequestContextBuilder::extend(ctx)
+                .download_behavior(crate::context::DownloadBehavior::Download)
+                .build();
+
            let _resident = layer
-                .download_and_keep_resident()
+                .download_and_keep_resident(ctx)
                .instrument(tracing::info_span!(
                    parent: None,
                    "download_layer",
@@ -7100,6 +7264,7 @@ mod tests {
                Lsn(0x10),
                14,
                &ctx,
+                Vec::new(), // in-memory layers
                delta_layers,
                image_layers,
                Lsn(0x100),
@@ -7116,7 +7281,7 @@ mod tests {
            .expect("Infallible while timeline is not shut down");

        // Both layers should be in the heatmap
-        assert!(!heatmap.layers.is_empty());
+        assert!(heatmap.all_layers().count() > 0);

        // Now simulate a migration.
        timeline
@@ -7124,6 +7289,7 @@ mod tests {
            .store(Some(Arc::new(PreviousHeatmap::Active {
                heatmap: heatmap.clone(),
                read_at: std::time::Instant::now(),
+                end_lsn: None,
            })));

        // Evict all the layers in the previous heatmap
@@ -7141,7 +7307,7 @@ mod tests {
            .await
            .expect("Infallible while timeline is not shut down");

-        assert!(post_eviction_heatmap.layers.is_empty());
+        assert_eq!(post_eviction_heatmap.all_layers().count(), 0);
        assert!(matches!(
            timeline.previous_heatmap.load().as_deref(),
            Some(PreviousHeatmap::Obsolete)
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -7,11 +7,20 @@
 use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
+use std::time::Instant;

-use anyhow::{Context, anyhow, bail};
+use super::layer_manager::LayerManager;
+use super::{
+    CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
+    GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration,
+    Timeline,
+};
+
+use anyhow::{Context, anyhow};
 use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
+use futures::FutureExt;
 use itertools::Itertools;
 use once_cell::sync::Lazy;
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
@@ -31,15 +40,8 @@ use utils::critical;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;

-use super::layer_manager::LayerManager;
-use super::{
-    CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
-    GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError,
-    RecordedDuration, Timeline,
-};
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::pgdatadir_mapping::CollectKeySpaceError;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::gc_block::GcBlock;
@@ -213,30 +215,39 @@ impl GcCompactionQueue {
    }

    /// Trigger an auto compaction.
-    pub async fn trigger_auto_compaction(&self, timeline: &Arc<Timeline>) {
+    pub async fn trigger_auto_compaction(
+        &self,
+        timeline: &Arc<Timeline>,
+    ) -> Result<(), CompactionError> {
        let GcCompactionCombinedSettings {
            gc_compaction_enabled,
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
        } = timeline.get_gc_compaction_settings();
        if !gc_compaction_enabled {
-            return;
+            return Ok(());
        }
        if self.remaining_jobs_num() > 0 {
            // Only schedule auto compaction when the queue is empty
-            return;
+            return Ok(());
        }
        if timeline.ancestor_timeline().is_some() {
            // Do not trigger auto compaction for child timelines. We haven't tested
            // it enough in staging yet.
-            return;
+            return Ok(());
+        }
+        if timeline.get_gc_compaction_watermark() == Lsn::INVALID {
+            // If the gc watermark is not set, we don't need to trigger auto compaction.
+            // This check is the same as in `gc_compaction_split_jobs` but we don't log
+            // here and we can also skip the computation of the trigger condition earlier.
+            return Ok(());
        }

        let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else {
            // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure
            // the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger`
            // to ensure the fairness while avoid starving other tasks.
-            return;
+            return Ok(());
        };

        let gc_compaction_state = timeline.get_gc_compaction_state();
@@ -246,7 +257,7 @@ impl GcCompactionQueue {

        let layers = {
            let guard = timeline.layers.read().await;
-            let layer_map = guard.layer_map().unwrap();
+            let layer_map = guard.layer_map()?;
            layer_map.iter_historic_layers().collect_vec()
        };
        let mut l2_size: u64 = 0;
@@ -318,11 +329,12 @@ impl GcCompactionQueue {
                l1_size, l2_size, l2_lsn, gc_cutoff
            );
        } else {
-            info!(
+            debug!(
                "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}",
                l1_size, l2_size, l2_lsn, gc_cutoff
            );
        }
+        Ok(())
    }

    /// Notify the caller the job has finished and unblock GC.
@@ -353,8 +365,7 @@ impl GcCompactionQueue {
                GcCompactJob::from_compact_options(options.clone()),
                options.sub_compaction_max_job_size_mb,
            )
-            .await
-            .map_err(CompactionError::Other)?;
+            .await?;
        if jobs.is_empty() {
            info!("no jobs to run, skipping scheduled compaction task");
            self.notify_and_unblock(id);
@@ -433,6 +444,7 @@ impl GcCompactionQueue {
            ));
        };
        let has_pending_tasks;
+        let mut yield_for_l0 = false;
        let Some((id, item)) = ({
            let mut guard = self.inner.lock().unwrap();
            if let Some((id, item)) = guard.queued.pop_front() {
@@ -444,7 +456,7 @@ impl GcCompactionQueue {
                None
            }
        }) else {
-            self.trigger_auto_compaction(timeline).await;
+            self.trigger_auto_compaction(timeline).await?;
            // Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we
            // have not implemented preemption mechanism yet. We always want to yield it to more important
            // tasks if there is one.
@@ -482,13 +494,23 @@ impl GcCompactionQueue {
                        let mut guard = self.inner.lock().unwrap();
                        guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
                    }
-                    let _ = timeline.compact_with_options(cancel, options, ctx).await?;
+                    let compaction_result =
+                        timeline.compact_with_options(cancel, options, ctx).await?;
                    self.notify_and_unblock(id);
+                    if compaction_result == CompactionOutcome::YieldForL0 {
+                        yield_for_l0 = true;
+                    }
                }
            }
            GcCompactionQueueItem::SubCompactionJob(options) => {
                // TODO: error handling, clear the queue if any task fails?
-                let _ = timeline.compact_with_options(cancel, options, ctx).await?;
+                let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?;
+                if compaction_result == CompactionOutcome::YieldForL0 {
+                    // We will permanently give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running
+                    // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because
+                    // we need to clean things up before returning from the function.
+                    yield_for_l0 = true;
+                }
            }
            GcCompactionQueueItem::Notify(id, l2_lsn) => {
                self.notify_and_unblock(id);
@@ -517,7 +539,10 @@ impl GcCompactionQueue {
            let mut guard = self.inner.lock().unwrap();
            guard.running = None;
        }
-        Ok(if has_pending_tasks {
+        Ok(if yield_for_l0 {
+            tracing::info!("give up gc-compaction: yield for L0 compaction");
+            CompactionOutcome::YieldForL0
+        } else if has_pending_tasks {
            CompactionOutcome::Pending
        } else {
            CompactionOutcome::Done
@@ -716,17 +741,41 @@ struct CompactionStatisticsNumSize {

 #[derive(Debug, Serialize, Default)]
 pub struct CompactionStatistics {
+    /// Delta layer visited (maybe compressed, physical size)
    delta_layer_visited: CompactionStatisticsNumSize,
+    /// Image layer visited (maybe compressed, physical size)
    image_layer_visited: CompactionStatisticsNumSize,
+    /// Delta layer produced (maybe compressed, physical size)
    delta_layer_produced: CompactionStatisticsNumSize,
+    /// Image layer produced (maybe compressed, physical size)
    image_layer_produced: CompactionStatisticsNumSize,
-    num_delta_layer_discarded: usize,
-    num_image_layer_discarded: usize,
+    /// Delta layer discarded (maybe compressed; the size recorded is the file size of the selected input layer with the same key, or 0 if there is none)
+    delta_layer_discarded: CompactionStatisticsNumSize,
+    /// Image layer discarded (maybe compressed; the size recorded is the file size of the selected input layer with the same key, or 0 if there is none)
+    image_layer_discarded: CompactionStatisticsNumSize,
    num_unique_keys_visited: usize,
+    /// Delta visited (uncompressed, original size)
    wal_keys_visited: CompactionStatisticsNumSize,
+    /// Image visited (uncompressed, original size)
    image_keys_visited: CompactionStatisticsNumSize,
+    /// Delta produced (uncompressed, original size)
    wal_produced: CompactionStatisticsNumSize,
+    /// Image produced (uncompressed, original size)
    image_produced: CompactionStatisticsNumSize,
+
+    // Time spent in each phase
+    time_acquire_lock_secs: f64,
+    time_analyze_secs: f64,
+    time_download_layer_secs: f64,
+    time_main_loop_secs: f64,
+    time_final_phase_secs: f64,
+    time_total_secs: f64,
+
+    // Summary
+    /// Ratio of the key-value size before/after gc-compaction.
+    uncompressed_size_ratio: f64,
+    /// Ratio of the physical size before/after gc-compaction.
+    physical_size_ratio: f64,
 }

 impl CompactionStatistics {
@@ -776,11 +825,13 @@ impl CompactionStatistics {
        self.image_produced.num += 1;
        self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
    }
-    fn discard_delta_layer(&mut self) {
-        self.num_delta_layer_discarded += 1;
+    fn discard_delta_layer(&mut self, original_size: u64) {
+        self.delta_layer_discarded.num += 1;
+        self.delta_layer_discarded.size += original_size;
    }
-    fn discard_image_layer(&mut self) {
-        self.num_image_layer_discarded += 1;
+    fn discard_image_layer(&mut self, original_size: u64) {
+        self.image_layer_discarded.num += 1;
+        self.image_layer_discarded.size += original_size;
    }
    fn produce_delta_layer(&mut self, size: u64) {
        self.delta_layer_produced.num += 1;
@@ -790,6 +841,19 @@ impl CompactionStatistics {
        self.image_layer_produced.num += 1;
        self.image_layer_produced.size += size;
    }
+    fn finalize(&mut self) {
+        let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
+        let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
+        self.uncompressed_size_ratio =
+            original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
+        let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
+        let produced_physical_size = self.image_layer_produced.size
+            + self.delta_layer_produced.size
+            + self.image_layer_discarded.size
+            + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
+        self.physical_size_ratio =
+            original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
+    }
 }

 #[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
@@ -822,9 +886,7 @@ impl Timeline {
            .flags
            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
        {
-            self.compact_with_gc(cancel, options, ctx)
-                .await
-                .map_err(CompactionError::Other)?;
+            self.compact_with_gc(cancel, options, ctx).await?;
            return Ok(CompactionOutcome::Done);
        }

@@ -976,18 +1038,12 @@ impl Timeline {

            // Suppress errors when cancelled.
            Err(_) if self.cancel.is_cancelled() => {}
-            Err(CompactionError::ShuttingDown) => {}
-            Err(CompactionError::CollectKeySpaceError(CollectKeySpaceError::Cancelled)) => {}
+            Err(err) if err.is_cancel() => {}

            // Alert on critical errors that indicate data corruption.
-            Err(
-                err @ CompactionError::CollectKeySpaceError(
-                    CollectKeySpaceError::Decode(_)
-                    | CollectKeySpaceError::PageRead(
-                        PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
-                    ),
-                ),
-            ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"),
+            Err(err) if err.is_critical() => {
+                critical!("could not compact, repartitioning keyspace failed: {err:?}");
+            }

            // Log other errors. No partitioning? This is normal, if the timeline was just created
            // as an empty timeline. Also in unit tests, when we use the timeline as a simple
@@ -1161,7 +1217,7 @@ impl Timeline {
            // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
            //    - GC, which at worst witnesses us "undelete" a layer that they just deleted.
            //    - ingestion, which only inserts layers, therefore cannot collide with us.
-            let resident = layer.download_and_keep_resident().await?;
+            let resident = layer.download_and_keep_resident(ctx).await?;

            let keys_written = resident
                .filter(&self.shard_identity, &mut image_layer_writer, ctx)
@@ -1389,14 +1445,14 @@ impl Timeline {

        let mut fully_compacted = true;

-        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
+        deltas_to_compact.push(first_level0_delta.download_and_keep_resident(ctx).await?);
        for l in level0_deltas_iter {
            let lsn_range = &l.layer_desc().lsn_range;

            if lsn_range.start != prev_lsn_end {
                break;
            }
-            deltas_to_compact.push(l.download_and_keep_resident().await?);
+            deltas_to_compact.push(l.download_and_keep_resident(ctx).await?);
            deltas_to_compact_bytes += l.metadata().file_size;
            prev_lsn_end = lsn_range.end;

@@ -2350,12 +2406,19 @@ impl Timeline {
    async fn check_compaction_space(
        self: &Arc<Self>,
        layer_selection: &[Layer],
-    ) -> anyhow::Result<()> {
-        let available_space = self.check_available_space().await?;
+    ) -> Result<(), CompactionError> {
+        let available_space = self
+            .check_available_space()
+            .await
+            .map_err(CompactionError::Other)?;
        let mut remote_layer_size = 0;
        let mut all_layer_size = 0;
        for layer in layer_selection {
-            let needs_download = layer.needs_download().await?;
+            let needs_download = layer
+                .needs_download()
+                .await
+                .context("failed to check if layer needs download")
+                .map_err(CompactionError::Other)?;
            if needs_download.is_some() {
                remote_layer_size += layer.layer_desc().file_size;
            }
@@ -2364,14 +2427,14 @@ impl Timeline {
        let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */
        if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space
        {
-            return Err(anyhow!(
+            return Err(CompactionError::Other(anyhow!(
                "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
                available_space,
                allocated_space,
                all_layer_size,
                remote_layer_size,
                all_layer_size + remote_layer_size
-            ));
+            )));
        }
        Ok(())
    }
@@ -2402,7 +2465,7 @@ impl Timeline {
        self: &Arc<Self>,
        job: GcCompactJob,
        sub_compaction_max_job_size_mb: Option<u64>,
-    ) -> anyhow::Result<Vec<GcCompactJob>> {
+    ) -> Result<Vec<GcCompactJob>, CompactionError> {
        let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
            job.compact_lsn_range.end
        } else {
@@ -2553,7 +2616,7 @@ impl Timeline {
        cancel: &CancellationToken,
        options: CompactOptions,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        let sub_compaction = options.sub_compaction;
        let job = GcCompactJob::from_compact_options(options.clone());
        if sub_compaction {
@@ -2575,7 +2638,7 @@ impl Timeline {
            if jobs_len == 0 {
                info!("no jobs to run, skipping gc bottom-most compaction");
            }
-            return Ok(());
+            return Ok(CompactionOutcome::Done);
        }
        self.compact_with_gc_inner(cancel, job, ctx).await
    }
@@ -2585,19 +2648,24 @@ impl Timeline {
        cancel: &CancellationToken,
        job: GcCompactJob,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        // Block other compaction/GC tasks from running for now. GC-compaction could run along
        // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
        // Note that we already acquired the compaction lock when the outer `compact` function gets called.

+        let timer = Instant::now();
+        let begin_timer = timer;
+
        let gc_lock = async {
            tokio::select! {
                guard = self.gc_lock.lock() => Ok(guard),
-                // TODO: refactor to CompactionError to correctly pass cancelled error
-                _ = cancel.cancelled() => Err(anyhow!("cancelled")),
+                _ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
            }
        };

+        let time_acquire_lock = timer.elapsed();
+        let timer = Instant::now();
+
        let gc_lock = crate::timed(
            gc_lock,
            "acquires gc lock",
@@ -2649,7 +2717,7 @@ impl Timeline {
                        tracing::warn!(
                            "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"
                        );
-                        return Ok(());
+                        return Ok(CompactionOutcome::Skipped);
                    }
                    real_gc_cutoff
                } else {
@@ -2687,7 +2755,7 @@ impl Timeline {
                    "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}",
                    gc_cutoff
                );
-                return Ok(());
+                return Ok(CompactionOutcome::Done);
            };
            // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
            // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
@@ -2708,7 +2776,7 @@ impl Timeline {
                    "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}",
                    compact_lsn_range.end
                );
-                return Ok(());
+                return Ok(CompactionOutcome::Done);
            };
            // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
            // layers to compact.
@@ -2734,7 +2802,7 @@ impl Timeline {
                    "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}",
                    gc_cutoff, compact_key_range.start, compact_key_range.end
                );
-                return Ok(());
+                return Ok(CompactionOutcome::Done);
            }
            retain_lsns_below_horizon.sort();
            GcCompactionJobDescription {
@@ -2787,6 +2855,9 @@ impl Timeline {
            has_data_below,
        );

+        let time_analyze = timer.elapsed();
+        let timer = Instant::now();
+
        for layer in &job_desc.selected_layers {
            debug!("read layer: {}", layer.layer_desc().key());
        }
@@ -2815,10 +2886,10 @@ impl Timeline {
            .map(|layer| layer.layer_desc().layer_name())
            .collect_vec();
        if let Some(err) = check_valid_layermap(&layer_names) {
-            bail!(
+            return Err(CompactionError::Other(anyhow!(
                "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss",
                err
-            );
+            )));
        }
        // The maximum LSN we are processing in this compaction loop
        let end_lsn = job_desc
@@ -2833,11 +2904,33 @@ impl Timeline {
        let mut total_downloaded_size = 0;
        let mut total_layer_size = 0;
        for layer in &job_desc.selected_layers {
-            if layer.needs_download().await?.is_some() {
+            if layer
+                .needs_download()
+                .await
+                .context("failed to check if layer needs download")
+                .map_err(CompactionError::Other)?
+                .is_some()
+            {
                total_downloaded_size += layer.layer_desc().file_size;
            }
            total_layer_size += layer.layer_desc().file_size;
-            let resident_layer = layer.download_and_keep_resident().await?;
+            if cancel.is_cancelled() {
+                return Err(CompactionError::ShuttingDown);
+            }
+            let should_yield = self
+                .l0_compaction_trigger
+                .notified()
+                .now_or_never()
+                .is_some();
+            if should_yield {
+                tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
+                return Ok(CompactionOutcome::YieldForL0);
+            }
+            let resident_layer = layer
+                .download_and_keep_resident(ctx)
+                .await
+                .context("failed to download and keep resident layer")
+                .map_err(CompactionError::Other)?;
            downloaded_layers.push(resident_layer);
        }
        info!(
@@ -2848,19 +2941,36 @@ impl Timeline {
        );
        for resident_layer in &downloaded_layers {
            if resident_layer.layer_desc().is_delta() {
-                let layer = resident_layer.get_as_delta(ctx).await?;
+                let layer = resident_layer
+                    .get_as_delta(ctx)
+                    .await
+                    .context("failed to get delta layer")
+                    .map_err(CompactionError::Other)?;
                delta_layers.push(layer);
            } else {
-                let layer = resident_layer.get_as_image(ctx).await?;
+                let layer = resident_layer
+                    .get_as_image(ctx)
+                    .await
+                    .context("failed to get image layer")
+                    .map_err(CompactionError::Other)?;
                image_layers.push(layer);
            }
        }
-        let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
+        let (dense_ks, sparse_ks) = self
+            .collect_gc_compaction_keyspace()
+            .await
+            .context("failed to collect gc compaction keyspace")
+            .map_err(CompactionError::Other)?;
        let mut merge_iter = FilterIterator::create(
            MergeIterator::create(&delta_layers, &image_layers, ctx),
            dense_ks,
            sparse_ks,
-        )?;
+        )
+        .context("failed to create filter iterator")
+        .map_err(CompactionError::Other)?;
+
+        let time_download_layer = timer.elapsed();
+        let timer = Instant::now();

        // Step 2: Produce images+deltas.
        let mut accumulated_values = Vec::new();
@@ -2880,7 +2990,9 @@ impl Timeline {
                    &self.gate,
                    ctx,
                )
-                .await?,
+                .await
+                .context("failed to create image layer writer")
+                .map_err(CompactionError::Other)?,
            )
        } else {
            None
@@ -2893,7 +3005,9 @@ impl Timeline {
            lowest_retain_lsn..end_lsn,
            self.get_compaction_target_size(),
        )
-        .await?;
+        .await
+        .context("failed to create delta layer writer")
+        .map_err(CompactionError::Other)?;

        #[derive(Default)]
        struct RewritingLayers {
@@ -2933,9 +3047,28 @@ impl Timeline {
        // the key and LSN range are determined. However, to keep things simple here, we still
        // create this writer, and discard the writer in the end.

-        while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
+        let mut keys_processed = 0;
+
+        while let Some(((key, lsn, val), desc)) = merge_iter
+            .next_with_trace()
+            .await
+            .context("failed to get next key-value pair")
+            .map_err(CompactionError::Other)?
+        {
            if cancel.is_cancelled() {
-                return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
+                return Err(CompactionError::ShuttingDown);
+            }
+            keys_processed += 1;
+            if keys_processed % 1000 == 0 {
+                let should_yield = self
+                    .l0_compaction_trigger
+                    .notified()
+                    .now_or_never()
+                    .is_some();
+                if should_yield {
+                    tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
+                    return Ok(CompactionOutcome::YieldForL0);
+                }
            }
            if self.shard_identity.is_key_disposable(&key) {
                // If this shard does not need to store this key, simply skip it.
@@ -2967,7 +3100,9 @@ impl Timeline {
                                &self.gate,
                                ctx,
                            )
-                            .await?,
+                            .await
+                            .context("failed to create delta layer writer")
+                            .map_err(CompactionError::Other)?,
                        );
                    }
                    rewriter.before.as_mut().unwrap()
@@ -2983,14 +3118,20 @@ impl Timeline {
                                &self.gate,
                                ctx,
                            )
-                            .await?,
+                            .await
+                            .context("failed to create delta layer writer")
+                            .map_err(CompactionError::Other)?,
                        );
                    }
                    rewriter.after.as_mut().unwrap()
                } else {
                    unreachable!()
                };
-                rewriter.put_value(key, lsn, val, ctx).await?;
+                rewriter
+                    .put_value(key, lsn, val, ctx)
+                    .await
+                    .context("failed to put value")
+                    .map_err(CompactionError::Other)?;
                continue;
            }
            match val {
@@ -3013,9 +3154,13 @@ impl Timeline {
                        &job_desc.retain_lsns_below_horizon,
                        COMPACTION_DELTA_THRESHOLD,
                        get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
-                            .await?,
+                            .await
+                            .context("failed to get ancestor image")
+                            .map_err(CompactionError::Other)?,
                    )
-                    .await?;
+                    .await
+                    .context("failed to generate key retention")
+                    .map_err(CompactionError::Other)?;
                retention
                    .pipe_to(
                        *last_key,
@@ -3025,7 +3170,9 @@ impl Timeline {
                        &self.gate,
                        ctx,
                    )
-                    .await?;
+                    .await
+                    .context("failed to pipe to delta layer writer")
+                    .map_err(CompactionError::Other)?;
                accumulated_values.clear();
                *last_key = key;
                accumulated_values.push((key, lsn, val));
@@ -3043,9 +3190,14 @@ impl Timeline {
                job_desc.gc_cutoff,
                &job_desc.retain_lsns_below_horizon,
                COMPACTION_DELTA_THRESHOLD,
-                get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
+                get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn)
+                    .await
+                    .context("failed to get ancestor image")
+                    .map_err(CompactionError::Other)?,
            )
-            .await?;
+            .await
+            .context("failed to generate key retention")
+            .map_err(CompactionError::Other)?;
        retention
            .pipe_to(
                last_key,
@@ -3055,21 +3207,36 @@ impl Timeline {
                &self.gate,
                ctx,
            )
-            .await?;
+            .await
+            .context("failed to pipe to delta layer writer")
+            .map_err(CompactionError::Other)?;
        // end: move the above part to the loop body

+        let time_main_loop = timer.elapsed();
+        let timer = Instant::now();
+
        let mut rewrote_delta_layers = Vec::new();
        for (key, writers) in delta_layer_rewriters {
            if let Some(delta_writer_before) = writers.before {
                let (desc, path) = delta_writer_before
                    .finish(job_desc.compaction_key_range.start, ctx)
-                    .await?;
-                let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+                    .await
+                    .context("failed to finish delta layer writer")
+                    .map_err(CompactionError::Other)?;
+                let layer = Layer::finish_creating(self.conf, self, desc, &path)
+                    .context("failed to finish creating delta layer")
+                    .map_err(CompactionError::Other)?;
                rewrote_delta_layers.push(layer);
            }
            if let Some(delta_writer_after) = writers.after {
-                let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
-                let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+                let (desc, path) = delta_writer_after
+                    .finish(key.key_range.end, ctx)
+                    .await
+                    .context("failed to finish delta layer writer")
+                    .map_err(CompactionError::Other)?;
+                let layer = Layer::finish_creating(self.conf, self, desc, &path)
+                    .context("failed to finish creating delta layer")
+                    .map_err(CompactionError::Other)?;
                rewrote_delta_layers.push(layer);
            }
        }
@@ -3084,7 +3251,9 @@ impl Timeline {
                let end_key = job_desc.compaction_key_range.end;
                writer
                    .finish_with_discard_fn(self, ctx, end_key, discard)
-                    .await?
+                    .await
+                    .context("failed to finish image layer writer")
+                    .map_err(CompactionError::Other)?
            } else {
                drop(writer);
                Vec::new()
@@ -3096,7 +3265,9 @@ impl Timeline {
        let produced_delta_layers = if !dry_run {
            delta_layer_writer
                .finish_with_discard_fn(self, ctx, discard)
-                .await?
+                .await
+                .context("failed to finish delta layer writer")
+                .map_err(CompactionError::Other)?
        } else {
            drop(delta_layer_writer);
            Vec::new()
@@ -3108,6 +3279,13 @@ impl Timeline {
        let mut keep_layers = HashSet::new();
        let produced_delta_layers_len = produced_delta_layers.len();
        let produced_image_layers_len = produced_image_layers.len();
+
+        let layer_selection_by_key = job_desc
+            .selected_layers
+            .iter()
+            .map(|l| (l.layer_desc().key(), l.layer_desc().clone()))
+            .collect::<HashMap<_, _>>();
+
        for action in produced_delta_layers {
            match action {
                BatchWriterResult::Produced(layer) => {
@@ -3121,8 +3299,16 @@ impl Timeline {
                    if cfg!(debug_assertions) {
                        info!("discarded delta layer: {}", l);
                    }
+                    if let Some(layer_desc) = layer_selection_by_key.get(&l) {
+                        stat.discard_delta_layer(layer_desc.file_size());
+                    } else {
+                        tracing::warn!(
+                            "discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
+                            l
+                        );
+                        stat.discard_delta_layer(0);
+                    }
                    keep_layers.insert(l);
-                    stat.discard_delta_layer();
                }
            }
        }
@@ -3131,6 +3317,9 @@ impl Timeline {
                "produced rewritten delta layer: {}",
                layer.layer_desc().key()
            );
+            // For now, we include the rewritten delta layer size in "produce_delta_layer". We could
+            // make it a separate statistic in the future.
+            stat.produce_delta_layer(layer.layer_desc().file_size());
        }
        compact_to.extend(rewrote_delta_layers);
        for action in produced_image_layers {
@@ -3142,8 +3331,16 @@ impl Timeline {
                }
                BatchWriterResult::Discarded(l) => {
                    debug!("discarded image layer: {}", l);
+                    if let Some(layer_desc) = layer_selection_by_key.get(&l) {
+                        stat.discard_image_layer(layer_desc.file_size());
+                    } else {
+                        tracing::warn!(
+                            "discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
+                            l
+                        );
+                        stat.discard_image_layer(0);
+                    }
                    keep_layers.insert(l);
-                    stat.discard_image_layer();
                }
            }
        }
@@ -3176,7 +3373,9 @@ impl Timeline {
                    &layer.layer_desc().key_range,
                    &job_desc.compaction_key_range,
                ) {
-                    bail!("violated constraint: image layer outside of compaction key range");
+                    return Err(CompactionError::Other(anyhow!(
+                        "violated constraint: image layer outside of compaction key range"
+                    )));
                }
                if !fully_contains(
                    &job_desc.compaction_key_range,
@@ -3189,13 +3388,25 @@ impl Timeline {

        layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));

+        let time_final_phase = timer.elapsed();
+
+        stat.time_final_phase_secs = time_final_phase.as_secs_f64();
+        stat.time_main_loop_secs = time_main_loop.as_secs_f64();
+        stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
+        stat.time_download_layer_secs = time_download_layer.as_secs_f64();
+        stat.time_analyze_secs = time_analyze.as_secs_f64();
+        stat.time_total_secs = begin_timer.elapsed().as_secs_f64();
+        stat.finalize();
+
        info!(
            "gc-compaction statistics: {}",
-            serde_json::to_string(&stat)?
+            serde_json::to_string(&stat)
+                .context("failed to serialize gc-compaction statistics")
+                .map_err(CompactionError::Other)?
        );

        if dry_run {
-            return Ok(());
+            return Ok(CompactionOutcome::Done);
        }

        info!(
@@ -3230,10 +3441,10 @@ impl Timeline {
        // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are
        // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails.
        if let Some(err) = check_valid_layermap(&final_layers) {
-            bail!(
+            return Err(CompactionError::Other(anyhow!(
                "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss",
                err
-            );
+            )));
        }

        // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
@@ -3285,7 +3496,9 @@ impl Timeline {
        // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should
        // be batched into `schedule_compaction_update`.
        let disk_consistent_lsn = self.disk_consistent_lsn.load();
-        self.schedule_uploads(disk_consistent_lsn, None)?;
+        self.schedule_uploads(disk_consistent_lsn, None)
+            .context("failed to schedule uploads")
+            .map_err(CompactionError::Other)?;
        // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead
        // of `compact_from`.
        let compact_from = {
@@ -3312,7 +3525,7 @@ impl Timeline {

        drop(gc_lock);

-        Ok(())
+        Ok(CompactionOutcome::Done)
    }
 }

@@ -3418,6 +3631,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
    async fn downcast_delta_layer(
        &self,
        layer: &OwnArc<PersistentLayerDesc>,
+        ctx: &RequestContext,
    ) -> anyhow::Result<Option<ResidentDeltaLayer>> {
        // this is a lot more complex than a simple downcast...
        if layer.is_delta() {
@@ -3425,7 +3639,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
                let guard = self.timeline.layers.read().await;
                guard.get_from_desc(layer)
            };
-            let result = l.download_and_keep_resident().await?;
+            let result = l.download_and_keep_resident(ctx).await?;

            Ok(Some(ResidentDeltaLayer(result)))
        } else {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -11,6 +11,7 @@ use utils::id::TimelineId;
 use utils::{crashsafe, fs_ext, pausable_failpoint};

 use crate::config::PageServerConf;
+use crate::context::RequestContext;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::{
@@ -291,10 +292,11 @@ impl DeleteTimelineFlow {
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
        remote_client: RemoteTimelineClient,
+        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
        // RemoteTimelineClient is the only functioning part.
-        let timeline = tenant
+        let (timeline, _timeline_ctx) = tenant
            .create_timeline_struct(
                timeline_id,
                local_metadata,
@@ -306,6 +308,8 @@ impl DeleteTimelineFlow {
                CreateTimelineCause::Delete,
                crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
                None, // doesn't matter what we put here
+                None, // doesn't matter what we put here
+                ctx,
            )
            .context("create_timeline_struct")?;

--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -12,6 +12,7 @@ use utils::completion;
 use utils::generation::Generation;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
+use utils::sync::gate::GateError;

 use super::layer_manager::LayerManager;
 use super::{FlushLayerError, Timeline};
@@ -363,14 +364,25 @@ pub(super) async fn prepare(

    let mut tasks = tokio::task::JoinSet::new();
    let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get()));
+    let cancel_eval = CancellationToken::new();

    for adopted in rest_of_historic {
        let limiter = limiter.clone();
        let timeline = detached.clone();
+        let cancel_eval = cancel_eval.clone();

        tasks.spawn(
            async move {
-                let _permit = limiter.acquire().await;
+                let _permit = tokio::select! {
+                    permit = limiter.acquire() => {
+                        permit
+                    }
+                    // Wait for the cancellation here instead of letting the entire task be cancelled.
+                    // Cancellations are racy in that they might leave layers on disk.
+                    _ = cancel_eval.cancelled() => {
+                        Err(Error::ShuttingDown)?
+                    }
+                };
                let (owned, did_hardlink) = remote_copy(
                    &adopted,
                    &timeline,
@@ -386,7 +398,22 @@ pub(super) async fn prepare(
        );
    }

+    /// Flags each layer in `layers` with `delete_on_drop` and then drops it immediately.
+    fn delete_layers(timeline: &Timeline, layers: Vec<Layer>) -> Result<(), Error> {
+        // Deleting layers requires holding the timeline gate; a closed gate means shutdown.
+        let _gate = timeline.gate.enter().map_err(|e| match e {
+            GateError::GateClosed => Error::ShuttingDown,
+        })?;
+        for layer in layers {
+            // Mark first, then drop explicitly while the gate guard is still alive.
+            layer.delete_on_drop();
+            drop(layer);
+        }
+        Ok(())
+    }
+
    let mut should_fsync = false;
+    let mut first_err = None;
    while let Some(res) = tasks.join_next().await {
        match res {
            Ok(Ok((owned, did_hardlink))) => {
@@ -395,13 +422,24 @@ pub(super) async fn prepare(
                }
                new_layers.push(owned);
            }
+
+            // Don't stop the evaluation on errors, so that we get the full set of hardlinked layers to delete.
            Ok(Err(failed)) => {
-                return Err(failed);
+                cancel_eval.cancel();
+                first_err.get_or_insert(failed);
+            }
+            Err(je) => {
+                cancel_eval.cancel();
+                first_err.get_or_insert(Error::Prepare(je.into()));
            }
-            Err(je) => return Err(Error::Prepare(je.into())),
        }
    }

+    if let Some(failed) = first_err {
+        delete_layers(detached, new_layers)?;
+        return Err(failed);
+    }
+
    // fsync directory again if we hardlinked something
    if should_fsync {
        fsync_timeline_dir(detached, ctx).await;
@@ -592,7 +630,7 @@ async fn copy_lsn_prefix(
    .with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}"))
    .map_err(Error::Prepare)?;

-    let resident = layer.download_and_keep_resident().await.map_err(|e| {
+    let resident = layer.download_and_keep_resident(ctx).await.map_err(|e| {
        if e.is_cancelled() {
            Error::ShuttingDown
        } else {
@@ -650,6 +688,11 @@ async fn remote_copy(
    let conf = adoptee.conf;
    let file_name = adopted.layer_desc().layer_name();

+    // We don't want to shut the timeline down during this operation because we do `delete_on_drop` below
+    let _gate = adoptee.gate.enter().map_err(|e| match e {
+        GateError::GateClosed => Error::ShuttingDown,
+    })?;
+
    // depending if Layer::keep_resident, do a hardlink
    let did_hardlink;
    let owned = if let Some(adopted_resident) = adopted.keep_resident().await {
@@ -661,8 +704,32 @@ async fn remote_copy(
            &file_name,
            &metadata.generation,
        );
-        std::fs::hard_link(adopted_path, &adoptee_path)
-            .map_err(|e| Error::launder(e.into(), Error::Prepare))?;
+
+        match std::fs::hard_link(adopted_path, &adoptee_path) {
+            Ok(()) => {}
+            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
+                // In theory we should not get into this situation as we are doing cleanups of the layer file after errors.
+                // However, we don't do cleanups for errors past `prepare`, so there is the slight chance to get to this branch.

+                // Double check that the file at `adoptee_path` is orphan (probably from an earlier attempt), then delete it
+                let key = file_name.clone().into();
+                if adoptee.layers.read().await.contains_key(&key) {
+                    // We are supposed to filter out such cases before coming to this function
+                    return Err(Error::Prepare(anyhow::anyhow!(
+                        "layer file {file_name} already present and inside layer map"
+                    )));
+                }
+                tracing::info!("Deleting orphan layer file to make way for hard linking");
+                // Remove the stale link target (never the source `adopted_path`), then retry so the layer has a well understood source
+                std::fs::remove_file(&adoptee_path)
+                    .map_err(|e| Error::launder(e.into(), Error::Prepare))?;
+                std::fs::hard_link(adopted_path, &adoptee_path)
+                    .map_err(|e| Error::launder(e.into(), Error::Prepare))?;
+            }
+            Err(e) => {
+                return Err(Error::launder(e.into(), Error::Prepare));
+            }
+        };
        did_hardlink = true;
        Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard()
    } else {
@@ -670,12 +737,21 @@ async fn remote_copy(
        Layer::for_evicted(conf, adoptee, file_name, metadata)
    };

-    let layer = adoptee
+    let layer = match adoptee
        .remote_client
        .copy_timeline_layer(adopted, &owned, cancel)
        .await
-        .map(move |()| owned)
-        .map_err(|e| Error::launder(e, Error::Prepare))?;
+    {
+        Ok(()) => owned,
+        Err(e) => {
+            {
+                // Clean up the layer so that on a retry we don't get errors that the file already exists
+                owned.delete_on_drop();
+                std::mem::drop(owned);
+            }
+            return Err(Error::launder(e, Error::Prepare));
+        }
+    };

    Ok((layer, did_hardlink))
 }
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -93,7 +93,8 @@ impl Timeline {
            }
        }

-        let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
+        let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn)
+            .with_scope_timeline(&self);
        loop {
            let policy = self.get_eviction_policy();
            let cf = self
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1,5 +1,4 @@
-//! An efficient way to keep the timeline gate open without preventing
-//! timeline shutdown for longer than a single call to a timeline method.
+//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`.
 //!
 //! # Motivation
 //!
@@ -19,27 +18,32 @@
 //! we hold the Timeline gate open while we're invoking the method on the
 //! Timeline object.
 //!
-//! However, we want to avoid the overhead of entering the gate for every
-//! method invocation.
-//!
-//! Further, for shard routing, we want to avoid calling the tenant manager to
-//! resolve the shard for every request. Instead, we want to cache the
-//! routing result so we can bypass the tenant manager for all subsequent requests
-//! that get routed to that shard.
+//! We want to avoid the overhead of doing, for each incoming request,
+//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
+//! - cloning the `Arc<Timeline>` out of the tenant manager so we can
+//!   release the mgr rwlock before doing any request processing work
+//! - re-entering the Timeline gate for each Timeline method invocation.
 //!
 //! Regardless of how we accomplish the above, it should not
 //! prevent the Timeline from shutting down promptly.
 //!
+//!
 //! # Design
 //!
 //! ## Data Structures
 //!
-//! There are three user-facing data structures:
+//! There are two concepts expressed as associated types in the `Types` trait:
+//! - `TenantManager`: the thing that performs the expensive work. It produces
+//!   a `Timeline` object, which is the other associated type.
+//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup.
+//!
+//! There are four user-facing data structures exposed by this module:
 //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
 //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
-//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
+//! - `Handle`: a smart pointer that derefs to the Types::Timeline.
 //! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
-//!   trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
+//!   trying to upgrade back to a `Handle`. If successful, a re-upgraded Handle will always
+//!   point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`.
 //!
 //! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
 //! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
@@ -64,11 +68,14 @@
 //!
 //! To dispatch a request, the page service connection calls `Cache::get`.
 //!
-//! A cache miss means we consult the tenant manager for shard routing,
-//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the the
-//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
+//! A cache miss means we call Types::TenantManager::resolve for shard routing,
+//! cloning the `Arc<Timeline>` out of it, and entering the gate. The result of
+//! resolve() is the object we want to cache and hand out `Handle`s to in subsequent `Cache::get` calls.
+//!
+//! We wrap the object returned from resolve() in an `Arc` and store that inside the
+//! `Arc<Mutex<HandleInner>>`. A weak ref to the HandleInner is stored in the `Cache`
 //! and a strong ref in the `PerTimelineState`.
-//! A strong ref is returned wrapped in a `Handle`.
+//! Another strong ref is returned wrapped in a `Handle`.
 //!
 //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
 //! and find the weak ref in the cache.
@@ -78,51 +85,51 @@
 //! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
 //! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
 //! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
-//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
+//! It then drops the `Handle`, and thus the `Arc<Mutex<HandleInner>>` inside it.
 //!
 //! # Performance
 //!
 //! Remember from the introductory section:
 //!
-//! > However, we want to avoid the overhead of entering the gate for every
-//! > method invocation.
+//! > We want to avoid the overhead of doing, for each incoming request,
+//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
+//! > - cloning the `Arc<Timeline>` out of the tenant manager so we can
+//! >   release the mgr rwlock before doing any request processing work
+//! > - re-entering the Timeline gate for each Timeline method invocation.
 //!
-//! Why do we want to avoid that?
-//! Because the gate is a shared location in memory and entering it involves
-//! bumping refcounts, which leads to cache contention if done frequently
-//! from multiple cores in parallel.
+//! All of these boil down to some state that is either globally shared among all shards
+//! or state shared among all tasks that serve a particular timeline.
+//! It is either protected by RwLock or manipulated via atomics.
+//! Even atomics are costly when shared across multiple cores.
+//! So, we want to avoid any permanent need for coordination between page_service tasks.
 //!
-//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
-//! That `Arc` is private to the `HandleInner` and hence to the connection.
+//! The solution is to add indirection: we wrap the Types::Timeline object that is
+//! returned by Types::TenantManager into an Arc that is private to the `HandleInner`
+//! and hence to the single Cache / page_service connection.
 //! (Review the "Data Structures" section if that is unclear to you.)
 //!
-//! A `WeakHandle` is a weak ref to the `HandleInner`.
-//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
-//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
-//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection.
 //!
-//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
-//! Again, this is cheap because the `Arc` is private to the connection.
+//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex<HandleInner>`),
+//! lock the mutex, take out a clone of the `Arc<Types::Timeline>`, and drop the Mutex.
+//! The Mutex is not contended because it is private to the connection.
+//! And again, the `Arc<Types::Timeline>` clone is cheap because that wrapper
+//! Arc's refcounts are private to the connection.
+//!
+//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection.
 //!
-//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
-//! For this, both `Handle` need infallible access to an `Arc<Timeline>`.
-//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
-//! on the shared memory location that trakcs the refcount of the `Arc<Timeline>`.
-//! Instead, we wrap the `Arc<Timeline>` into another `Arc`.
-//! so that we can clone it cheaply when upgrading a `WeakHandle`.
 //!
 //! # Shutdown
 //!
 //! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
 //!
 //! ```text
-//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
+//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline
 //! ```
 //!
 //! Further, there is this cycle:
 //!
 //! ```text
-//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
+//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline
 //! ```
 //!
 //! The former cycle is a memory leak if not broken.
@@ -135,9 +142,12 @@
 //! - Timeline shutdown (=> `PerTimelineState::shutdown`)
 //! - Connection shutdown (=> dropping the `Cache`).
 //!
-//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
-//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
-//! `Arc<GateGuard>`.
+//! Both transition the `HandleInner` from [`HandleInner::Open`] to
+//! [`HandleInner::ShutDown`], which drops the only long-lived
+//! `Arc<Types::Timeline>`. Once the last short-lived Arc<Types::Timeline>
+//! is dropped, the `Types::Timeline` gets dropped and thereby
+//! the `GateGuard` and the `Arc<Timeline>` that it stores,
+//! thereby breaking both cycles.
 //!
 //! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
 //! thereby breaking the cycle.
@@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector;
 pub(crate) trait Types: Sized + std::fmt::Debug {
    type TenantManagerError: Sized + std::fmt::Debug;
    type TenantManager: TenantManager<Self> + Sized;
-    type Timeline: ArcTimeline<Self> + Sized;
+    type Timeline: Timeline<Self> + Sized;
 }

 /// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
@@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId {

 /// See module-level comment.
 pub(crate) struct Handle<T: Types> {
-    timeline: Arc<T::Timeline>,
-    #[allow(dead_code)] // the field exists to keep the gate open
-    gate_guard: Arc<utils::sync::gate::GateGuard>,
    inner: Arc<Mutex<HandleInner<T>>>,
+    open: Arc<T::Timeline>,
 }
 pub(crate) struct WeakHandle<T: Types> {
    inner: Weak<Mutex<HandleInner<T>>>,
 }
+
 enum HandleInner<T: Types> {
-    KeepingTimelineGateOpen {
-        #[allow(dead_code)]
-        gate_guard: Arc<utils::sync::gate::GateGuard>,
-        timeline: Arc<T::Timeline>,
-    },
+    Open(Arc<T::Timeline>),
    ShutDown,
 }

@@ -307,8 +312,7 @@ pub(crate) trait TenantManager<T: Types> {
 }

 /// Abstract view of an [`Arc<Timeline>`], for testability.
-pub(crate) trait ArcTimeline<T: Types>: Clone {
-    fn gate(&self) -> &utils::sync::gate::Gate;
+pub(crate) trait Timeline<T: Types> {
    fn shard_timeline_id(&self) -> ShardTimelineId;
    fn get_shard_identity(&self) -> &ShardIdentity;
    fn per_timeline_state(&self) -> &PerTimelineState<T>;
@@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline<T: Types>: Clone {
 #[derive(Debug)]
 pub(crate) enum GetError<T: Types> {
    TenantManager(T::TenantManagerError),
-    TimelineGateClosed,
    PerTimelineStateShutDown,
 }

@@ -434,21 +437,9 @@ impl<T: Types> Cache<T> {
                }

                trace!("creating new HandleInner");
-                let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
-                    gate_guard: Arc::new(
-                        // this enter() is expensive in production code because
-                        // it hits the global Arc<Timeline>::gate refcounts
-                        match timeline.gate().enter() {
-                            Ok(guard) => guard,
-                            Err(_) => {
-                                return Err(GetError::TimelineGateClosed);
-                            }
-                        },
-                    ),
-                    // this clone is expensive in production code because
-                    // it hits the global Arc<Timeline>::clone refcounts
-                    timeline: Arc::new(timeline.clone()),
-                }));
+                let timeline = Arc::new(timeline);
+                let handle_inner_arc =
+                    Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
                let handle_weak = WeakHandle {
                    inner: Arc::downgrade(&handle_inner_arc),
                };
@@ -503,18 +494,10 @@ impl<T: Types> WeakHandle<T> {
        };
        let lock_guard = inner.lock().expect("poisoned");
        match &*lock_guard {
-            HandleInner::KeepingTimelineGateOpen {
-                timeline,
-                gate_guard,
-            } => {
-                let gate_guard = Arc::clone(gate_guard);
-                let timeline = Arc::clone(timeline);
+            HandleInner::Open(open) => {
+                let open = Arc::clone(open);
                drop(lock_guard);
-                Ok(Handle {
-                    timeline,
-                    gate_guard,
-                    inner,
-                })
+                Ok(Handle { open, inner })
            }
            HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
        }
@@ -528,7 +511,7 @@ impl<T: Types> WeakHandle<T> {
 impl<T: Types> std::ops::Deref for Handle<T> {
    type Target = T::Timeline;
    fn deref(&self) -> &Self::Target {
-        &self.timeline
+        &self.open
    }
 }

@@ -545,7 +528,7 @@ impl<T: Types> PerTimelineState<T> {
    /// to the [`Types::Timeline`] that embeds this per-timeline state.
    /// Even if [`TenantManager::resolve`] would still resolve to it.
    ///
-    /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
+    /// Already-alive [`Handle`]s will remain open and usable, and keep the [`Types::Timeline`] alive.
    /// That's ok because they're short-lived. See module-level comment for details.
    #[instrument(level = "trace", skip_all)]
    pub(super) fn shutdown(&self) {
@@ -611,7 +594,7 @@ impl<T: Types> Drop for Cache<T> {
 impl<T: Types> HandleInner<T> {
    fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
        match std::mem::replace(self, HandleInner::ShutDown) {
-            HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
+            HandleInner::Open(timeline) => Some(timeline),
            HandleInner::ShutDown => {
                // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
                // may do it concurrently, but locking rules disallow holding per-timeline-state lock and
@@ -631,6 +614,7 @@ mod tests {
    use pageserver_api::reltag::RelTag;
    use pageserver_api::shard::ShardStripeSize;
    use utils::shard::ShardCount;
+    use utils::sync::gate::GateGuard;

    use super::*;

@@ -641,7 +625,7 @@ mod tests {
    impl Types for TestTypes {
        type TenantManagerError = anyhow::Error;
        type TenantManager = StubManager;
-        type Timeline = Arc<StubTimeline>;
+        type Timeline = Entered;
    }

    struct StubManager {
@@ -656,17 +640,19 @@ mod tests {
        myself: Weak<StubTimeline>,
    }

+    struct Entered {
+        timeline: Arc<StubTimeline>,
+        #[allow(dead_code)] // it's stored here to keep the gate open
+        gate_guard: Arc<GateGuard>,
+    }
+
    impl StubTimeline {
        fn getpage(&self) {
            // do nothing
        }
    }

-    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
-        fn gate(&self) -> &utils::sync::gate::Gate {
-            &self.gate
-        }
-
+    impl Timeline<TestTypes> for Entered {
        fn shard_timeline_id(&self) -> ShardTimelineId {
            ShardTimelineId {
                shard_index: self.shard.shard_index(),
@@ -688,20 +674,34 @@ mod tests {
            &self,
            timeline_id: TimelineId,
            shard_selector: ShardSelector,
-        ) -> anyhow::Result<Arc<StubTimeline>> {
+        ) -> anyhow::Result<Entered> {
            for timeline in &self.shards {
                if timeline.id == timeline_id {
+                    let enter_gate = || {
+                        let gate_guard = timeline.gate.enter()?;
+                        let gate_guard = Arc::new(gate_guard);
+                        anyhow::Ok(gate_guard)
+                    };
                    match &shard_selector {
                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                        }
                        ShardSelector::Zero => continue,
                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                        }
                        ShardSelector::Page(_) => continue,
                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                        }
                        ShardSelector::Known(_) => continue,
                    }
@@ -711,6 +711,13 @@ mod tests {
        }
    }

+    impl std::ops::Deref for Entered {
+        type Target = StubTimeline;
+        fn deref(&self) -> &Self::Target {
+            &self.timeline
+        }
+    }
+
    #[tokio::test(start_paused = true)]
    async fn test_timeline_shutdown() {
        crate::tenant::harness::setup_logging();
@@ -1038,7 +1045,6 @@ mod tests {
        let key = DBDIR_KEY;

        // Simulate 10 connections that's opened, used, and closed
-        let mut used_handles = vec![];
        for _ in 0..10 {
            let mut cache = Cache::<TestTypes>::default();
            let handle = {
@@ -1050,7 +1056,6 @@ mod tests {
                handle
            };
            handle.getpage();
-            used_handles.push(Arc::downgrade(&handle.timeline));
        }

        // No handles exist, thus gates are closed and don't require shutdown.
--- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
+++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
@@ -10,6 +10,8 @@ use http_utils::error::ApiError;
 use tokio_util::sync::CancellationToken;
 use utils::sync::gate::Gate;

+use crate::context::RequestContext;
+
 use super::Timeline;

 // This status is not strictly necessary now, but gives us a nice place
@@ -30,6 +32,8 @@ impl HeatmapLayersDownloader {
    fn new(
        timeline: Arc<Timeline>,
        concurrency: usize,
+        recurse: bool,
+        ctx: RequestContext,
    ) -> Result<HeatmapLayersDownloader, ApiError> {
        let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?;

@@ -57,12 +61,13 @@ impl HeatmapLayersDownloader {

                tracing::info!(
                    resident_size=%timeline.resident_physical_size(),
-                    heatmap_layers=%heatmap.layers.len(),
+                    heatmap_layers=%heatmap.all_layers().count(),
                    "Starting heatmap layers download"
                );

-                let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map(
+                let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
                    |layer| {
+                        let ctx = ctx.attached_child();
                        let tl = timeline.clone();
                        let dl_guard = match downloads_guard.enter() {
                            Ok(g) => g,
@@ -75,7 +80,7 @@ impl HeatmapLayersDownloader {
                        Some(async move {
                            let _dl_guard = dl_guard;

-                            let res = tl.download_layer(&layer.name).await;
+                            let res = tl.download_layer(&layer.name, &ctx).await;
                            if let Err(err) = res {
                                if !err.is_cancelled() {
                                    tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}")
@@ -94,6 +99,20 @@ impl HeatmapLayersDownloader {
                    },
                    _ = cancel.cancelled() => {
                        tracing::info!("Heatmap layers download cancelled");
+                        return;
+                    }
+                }
+
+                if recurse {
+                    if let Some(ancestor) = timeline.ancestor_timeline() {
+                        let ctx = ctx.attached_child();
+                        let res =
+                            ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx);
+                        if let Err(err) = res {
+                            tracing::info!(
+                                "Failed to start heatmap layers download for ancestor: {err}"
+                            );
+                        }
                    }
                }
            }
@@ -136,13 +155,20 @@ impl HeatmapLayersDownloader {
 }

 impl Timeline {
-    pub(crate) async fn start_heatmap_layers_download(
+    pub(crate) fn start_heatmap_layers_download(
        self: &Arc<Self>,
        concurrency: usize,
+        recurse: bool,
+        ctx: &RequestContext,
    ) -> Result<(), ApiError> {
        let mut locked = self.heatmap_layers_downloader.lock().unwrap();
        if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) {
-            let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?;
+            let dl = HeatmapLayersDownloader::new(
+                self.clone(),
+                concurrency,
+                recurse,
+                ctx.attached_child(),
+            )?;
            *locked = Some(dl);
            Ok(())
        } else {
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -8,14 +8,14 @@ use tracing::trace;
 use utils::id::TimelineId;
 use utils::lsn::{AtomicLsn, Lsn};

-use super::TimelineWriterState;
+use super::{ReadableLayer, TimelineWriterState};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::metrics::TimelineMetrics;
 use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
 use crate::tenant::storage_layer::{
    AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
-    PersistentLayerKey, ResidentLayer,
+    PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
 };

 /// Provides semantic APIs to manipulate the layer map.
@@ -37,6 +37,21 @@ impl Default for LayerManager {
 }

 impl LayerManager {
+    /// Turns a weak layer reference back into a [`ReadableLayer`] using this manager.
+    pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
+        match weak {
+            ReadableLayerWeak::InMemoryLayer(desc) => {
+                // Panics with "no concurrent shutdown" if `layer_map()` yields no map.
+                let map = self.layer_map().expect("no concurrent shutdown");
+                ReadableLayer::InMemoryLayer(map.in_memory_layer(&desc))
+            }
+            ReadableLayerWeak::PersistentLayer(desc) => {
+                let layer = self.get_from_desc(&desc);
+                ReadableLayer::PersistentLayer(layer)
+            }
+        }
+    }
+
    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
@@ -470,6 +485,25 @@ impl OpenLayerManager {
        mapping.remove(layer);
        layer.delete_on_drop();
    }
+
+    #[cfg(test)] // compiled only in test builds
+    pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc<InMemoryLayer>) { // Put `layer` straight into the layer map, as the open layer or as a frozen layer, chosen by `layer.info()`.
+        use pageserver_api::models::InMemoryLayerInfo;
+
+        match layer.info() {
+            InMemoryLayerInfo::Open { .. } => {
+                assert!(self.layer_map.open_layer.is_none()); // panics rather than replacing an already-present open layer
+                self.layer_map.open_layer = Some(layer);
+            }
+            InMemoryLayerInfo::Frozen { lsn_start, .. } => {
+                if let Some(last) = self.layer_map.frozen_layers.back() {
+                    assert!(last.get_lsn_range().end <= lsn_start); // the new layer must not start before the end of the last frozen layer's LSN range
+                }
+
+                self.layer_map.frozen_layers.push_back(layer); // appended at the back of `frozen_layers`
+            }
+        }
+    }
 }

 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -961,7 +961,8 @@ mod tests {
    }

    async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let (_temp_dir, pathbuf, offsets) =
            write_maybe_compressed(blobs, compression, &ctx).await?;

--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -26,15 +26,14 @@ use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut};
 use owned_buffers_io::io_buf_ext::FullSlice;
 use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 pub use pageserver_api::models::virtual_file as api;
-use pageserver_api::shard::TenantShardId;
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

+use crate::assert_u64_eq_usize::UsizeIsU64;
 use crate::context::RequestContext;
-use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC, StorageIoOperation};
+use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation};
 use crate::page_cache::{PAGE_SZ, PageWriteGuard};
-use crate::tenant::TENANTS_SEGMENT_NAME;
 pub(crate) mod io_engine;
 pub use io_engine::{
    FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test,
@@ -121,7 +120,7 @@ impl VirtualFile {
    pub async fn open_with_options<P: AsRef<Utf8Path>>(
        path: P,
        open_options: &OpenOptions,
-        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+        ctx: &RequestContext,
    ) -> Result<Self, std::io::Error> {
        let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
        Ok(VirtualFile {
@@ -133,7 +132,7 @@ impl VirtualFile {
    pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
        path: P,
        open_options: &OpenOptions,
-        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+        ctx: &RequestContext,
    ) -> Result<Self, std::io::Error> {
        let file = match get_io_mode() {
            IoMode::Buffered => {
@@ -300,13 +299,6 @@ pub struct VirtualFileInner {
    /// storing it here.
    pub path: Utf8PathBuf,
    open_options: OpenOptions,
-
-    // These are strings becase we only use them for metrics, and those expect strings.
-    // It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
-    // strings.
-    tenant_id: String,
-    shard_id: String,
-    timeline_id: String,
 }

 #[derive(Debug, PartialEq, Clone, Copy)]
@@ -588,36 +580,16 @@ impl VirtualFileInner {
    pub async fn open_with_options<P: AsRef<Utf8Path>>(
        path: P,
        open_options: &OpenOptions,
-        _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+        _ctx: &RequestContext,
    ) -> Result<VirtualFileInner, std::io::Error> {
-        let path_ref = path.as_ref();
-        let path_str = path_ref.to_string();
-        let parts = path_str.split('/').collect::<Vec<&str>>();
-        let (tenant_id, shard_id, timeline_id) =
-            if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
-                let tenant_shard_part = parts[parts.len() - 4];
-                let (tenant_id, shard_id) = match tenant_shard_part.parse::<TenantShardId>() {
-                    Ok(tenant_shard_id) => (
-                        tenant_shard_id.tenant_id.to_string(),
-                        format!("{}", tenant_shard_id.shard_slug()),
-                    ),
-                    Err(_) => {
-                        // Malformed path: this ID is just for observability, so tolerate it
-                        // and pass through
-                        (tenant_shard_part.to_string(), "*".to_string())
-                    }
-                };
-                (tenant_id, shard_id, parts[parts.len() - 2].to_string())
-            } else {
-                ("*".to_string(), "*".to_string(), "*".to_string())
-            };
+        let path = path.as_ref();
        let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;

        // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
        // where our caller doesn't get to use the returned VirtualFile before its
        // slot gets re-used by someone else.
        let file = observe_duration!(StorageIoOperation::Open, {
-            open_options.open(path_ref.as_std_path()).await?
+            open_options.open(path.as_std_path()).await?
        });

        // Strip all options other than read and write.
@@ -633,11 +605,8 @@ impl VirtualFileInner {
        let vfile = VirtualFileInner {
            handle: RwLock::new(handle),
            pos: 0,
-            path: path_ref.to_path_buf(),
+            path: path.to_owned(),
            open_options: reopen_options,
-            tenant_id,
-            shard_id,
-            timeline_id,
        };

        // TODO: Under pressure, it's likely the slot will get re-used and
@@ -934,7 +903,7 @@ impl VirtualFileInner {
        &self,
        buf: tokio_epoll_uring::Slice<Buf>,
        offset: u64,
-        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+        ctx: &RequestContext,
    ) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
    where
        Buf: tokio_epoll_uring::IoBufMut + Send,
@@ -952,14 +921,7 @@ impl VirtualFileInner {
            let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
            let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
            if let Ok(size) = res {
-                STORAGE_IO_SIZE
-                    .with_label_values(&[
-                        "read",
-                        &self.tenant_id,
-                        &self.shard_id,
-                        &self.timeline_id,
-                    ])
-                    .add(size as i64);
+                ctx.io_size_metrics().read.add(size.into_u64());
            }
            (buf, res)
        })
@@ -970,9 +932,9 @@ impl VirtualFileInner {
        &self,
        buf: FullSlice<B>,
        offset: u64,
-        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+        ctx: &RequestContext,
    ) -> (FullSlice<B>, Result<usize, Error>) {
-        let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
+        let (slice, result) = self.write_at_inner(buf, offset, ctx).await;
        let result = result.maybe_fatal_err("write_at");
        (slice, result)
    }
@@ -981,7 +943,7 @@ impl VirtualFileInner {
        &self,
        buf: FullSlice<B>,
        offset: u64,
-        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+        ctx: &RequestContext,
    ) -> (FullSlice<B>, Result<usize, Error>) {
        let file_guard = match self.lock_file().await {
            Ok(file_guard) => file_guard,
@@ -991,14 +953,7 @@ impl VirtualFileInner {
            let ((_file_guard, buf), result) =
                io_engine::get().write_at(file_guard, offset, buf).await;
            if let Ok(size) = result {
-                STORAGE_IO_SIZE
-                    .with_label_values(&[
-                        "write",
-                        &self.tenant_id,
-                        &self.shard_id,
-                        &self.timeline_id,
-                    ])
-                    .add(size as i64);
+                ctx.io_size_metrics().write.add(size.into_u64());
            }
            (buf, result)
        })
@@ -1584,7 +1539,8 @@ mod tests {
    where
        A: Adapter,
    {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir = crate::config::PageServerConf::test_repo_dir(testname);
        std::fs::create_dir_all(&testdir)?;

@@ -1711,7 +1667,8 @@ mod tests {
        const THREADS: usize = 100;
        const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];

-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
        std::fs::create_dir_all(&testdir)?;

@@ -1770,7 +1727,8 @@ mod tests {

    #[tokio::test]
    async fn test_atomic_overwrite_basic() {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
        std::fs::create_dir_all(&testdir).unwrap();

@@ -1798,7 +1756,8 @@ mod tests {

    #[tokio::test]
    async fn test_atomic_overwrite_preexisting_tmp() {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let ctx =
+            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let testdir =
            crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
        std::fs::create_dir_all(&testdir).unwrap();
--- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
@@ -181,7 +181,8 @@ where
        Err(self
            .shutdown()
            .await
-            .expect_err("flush task only disconnects duplex if it exits with an error"))
+            .err()
+            .expect("flush task only disconnects duplex if it exits with an error"))
    }

    /// Cleans up the channel, join the flush task.
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -136,7 +136,9 @@ impl WalRedoProcess {
                        Ok(0) => break Ok(()), // eof
                        Ok(num_bytes) => {
                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
+                            if !output.contains("LOG:") {
+                               error!(%output, "received output");
+                            }
                        }
                        Err(e) => {
                            break Err(e);