Merge branch 'main' into yuchen/direct-io-aligned-alloc

2026-06-02 21:10:38 +00:00 · 2024-09-24 21:29:33 -04:00
parent ff4a1db223 5f2f31e879
commit a2be8a440b
348 changed files with 15903 additions and 6502 deletions
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -8,14 +8,13 @@ license.workspace = true
 default = []
 # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
 # which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints"]
+testing = ["fail/failpoints", "pageserver_api/testing" ]

 [dependencies]
 anyhow.workspace = true
 arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
-async-trait.workspace = true
 bit_field.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
@@ -23,12 +22,9 @@ camino.workspace = true
 camino-tempfile.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
-const_format.workspace = true
 consumption_metrics.workspace = true
 crc32c.workspace = true
-crossbeam-utils.workspace = true
 either.workspace = true
-flate2.workspace = true
 fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
@@ -57,10 +53,6 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_path_to_error.workspace = true
 serde_with.workspace = true
-signal-hook.workspace = true
-smallvec = { workspace = true, features = ["write"] }
-svg_fmt.workspace = true
-sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
@@ -73,7 +65,6 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
-twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
@@ -101,6 +92,7 @@ procfs.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
+indoc.workspace = true

 [[bench]]
 name = "bench_layer_map"
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -4,7 +4,7 @@ use bytes::Bytes;
 use camino::Utf8PathBuf;
 use criterion::{criterion_group, criterion_main, Criterion};
 use pageserver::{
-    config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
+    config::PageServerConf,
    context::{DownloadBehavior, RequestContext},
    l0_flush::{L0FlushConfig, L0FlushGlobalState},
    page_cache,
@@ -167,7 +167,7 @@ fn criterion_benchmark(c: &mut Criterion) {
    virtual_file::init(
        16384,
        virtual_file::io_engine_for_bench(),
-        DEFAULT_IO_BUFFER_ALIGNMENT,
+        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
    );
    page_cache::init(conf.page_cache_size);

--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -1,7 +1,7 @@
 //! Quantify a single walredo manager's throughput under N concurrent callers.
 //!
 //! The benchmark implementation ([`bench_impl`]) is parametrized by
-//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
+//! - `redo_work` => an async closure that takes a `PostgresRedoManager` and performs one redo
 //! - `n_redos` => number of times the benchmark shell execute the `redo_work`
 //! - `nclients` => number of clients (more on this shortly).
 //!
@@ -10,7 +10,7 @@
 //! Each task executes the `redo_work` `n_redos/nclients` times.
 //!
 //! We exercise the following combinations:
-//! - `redo_work = short / medium``
+//! - `redo_work = ping / short / medium``
 //! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
 //!
 //! We let `criterion` determine the `n_redos` using `iter_custom`.
@@ -27,33 +27,43 @@
 //!
 //! # Reference Numbers
 //!
-//! 2024-04-15 on i3en.3xlarge
+//! 2024-09-18 on im4gn.2xlarge
 //!
 //! ```text
-//! short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
-//! short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
-//! short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
-//! short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
-//! short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
-//! short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
-//! short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
-//! short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
-//! medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
-//! medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
-//! medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
-//! medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
-//! medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
-//! medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
-//! medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
-//! medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
+//! ping/1                  time:   [21.789 µs 21.918 µs 22.078 µs]
+//! ping/2                  time:   [27.686 µs 27.812 µs 27.970 µs]
+//! ping/4                  time:   [35.468 µs 35.671 µs 35.926 µs]
+//! ping/8                  time:   [59.682 µs 59.987 µs 60.363 µs]
+//! ping/16                 time:   [101.79 µs 102.37 µs 103.08 µs]
+//! ping/32                 time:   [184.18 µs 185.15 µs 186.36 µs]
+//! ping/64                 time:   [349.86 µs 351.45 µs 353.47 µs]
+//! ping/128                time:   [684.53 µs 687.98 µs 692.17 µs]
+//! short/1                 time:   [31.833 µs 32.126 µs 32.428 µs]
+//! short/2                 time:   [35.558 µs 35.756 µs 35.992 µs]
+//! short/4                 time:   [44.850 µs 45.138 µs 45.484 µs]
+//! short/8                 time:   [65.985 µs 66.379 µs 66.853 µs]
+//! short/16                time:   [127.06 µs 127.90 µs 128.87 µs]
+//! short/32                time:   [252.98 µs 254.70 µs 256.73 µs]
+//! short/64                time:   [497.13 µs 499.86 µs 503.26 µs]
+//! short/128               time:   [987.46 µs 993.45 µs 1.0004 ms]
+//! medium/1                time:   [137.91 µs 138.55 µs 139.35 µs]
+//! medium/2                time:   [192.00 µs 192.91 µs 194.07 µs]
+//! medium/4                time:   [389.62 µs 391.55 µs 394.01 µs]
+//! medium/8                time:   [776.80 µs 780.33 µs 784.77 µs]
+//! medium/16               time:   [1.5323 ms 1.5383 ms 1.5459 ms]
+//! medium/32               time:   [3.0120 ms 3.0226 ms 3.0350 ms]
+//! medium/64               time:   [5.7405 ms 5.7787 ms 5.8166 ms]
+//! medium/128              time:   [10.412 ms 10.574 ms 10.718 ms]
 //! ```

 use anyhow::Context;
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
+use once_cell::sync::Lazy;
 use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
+    future::Future,
    sync::Arc,
    time::{Duration, Instant},
 };
@@ -61,40 +71,59 @@ use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};

 fn bench(c: &mut Criterion) {
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("short");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::short_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
-    }
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("medium");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::medium_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
+    macro_rules! bench_group {
+        ($name:expr, $redo_work:expr) => {{
+            let name: &str = $name;
+            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+            for nclients in nclients {
+                let mut group = c.benchmark_group(name);
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(nclients),
+                    &nclients,
+                    |b, nclients| {
+                        b.iter_custom(|iters| bench_impl($redo_work, iters, *nclients));
+                    },
+                );
+            }
+        }};
    }
+    //
+    // benchmark the protocol implementation
+    //
+    let pg_version = 14;
+    bench_group!(
+        "ping",
+        Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
+            let _: () = mgr.ping(pg_version).await.unwrap();
+        })
+    );
+    //
+    // benchmarks with actual record redo
+    //
+    let make_redo_work = |req: &'static Request| {
+        Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
+            let page = req.execute(&mgr).await.unwrap();
+            assert_eq!(page.remaining(), 8192);
+        })
+    };
+    bench_group!("short", {
+        static REQUEST: Lazy<Request> = Lazy::new(Request::short_input);
+        make_redo_work(&REQUEST)
+    });
+    bench_group!("medium", {
+        static REQUEST: Lazy<Request> = Lazy::new(Request::medium_input);
+        make_redo_work(&REQUEST)
+    });
 }
 criterion::criterion_group!(benches, bench);
 criterion::criterion_main!(benches);

 // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
+fn bench_impl<F, Fut>(redo_work: Arc<F>, n_redos: u64, nclients: u64) -> Duration
+where
+    F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
+    Fut: Future<Output = ()> + Send + 'static,
+{
    let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();

    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
@@ -135,17 +164,20 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
    })
 }

-async fn client(
+async fn client<F, Fut>(
    mgr: Arc<PostgresRedoManager>,
    start: Arc<Barrier>,
-    redo_work: Arc<Request>,
+    redo_work: Arc<F>,
    n_redos: u64,
-) -> Duration {
+) -> Duration
+where
+    F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
+    Fut: Future<Output = ()> + Send + 'static,
+{
    start.wait().await;
    let start = Instant::now();
    for _ in 0..n_redos {
-        let page = redo_work.execute(&mgr).await.unwrap();
-        assert_eq!(page.remaining(), 8192);
+        redo_work(Arc::clone(&mgr)).await;
        // The real pageserver will rarely if ever do 2 walredos in a row without
        // yielding to the executor.
        tokio::task::yield_now().await;
--- a/pageserver/client/src/lib.rs
+++ b/pageserver/client/src/lib.rs
@@ -1,2 +1,20 @@
 pub mod mgmt_api;
 pub mod page_service;
+
+/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool.
+// If file structure is per-kind not per-feature then where to put this?
+#[derive(Clone, Copy)]
+pub enum BlockUnblock {
+    Block,
+    Unblock,
+}
+
+impl std::fmt::Display for BlockUnblock {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let s = match self {
+            BlockUnblock::Block => "block",
+            BlockUnblock::Unblock => "unblock",
+        };
+        f.write_str(s)
+    }
+}
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -12,6 +12,8 @@ use utils::{

 pub use reqwest::Body as ReqwestBody;

+use crate::BlockUnblock;
+
 pub mod util;

 #[derive(Debug, Clone)]
@@ -430,7 +432,7 @@ impl Client {
            self.mgmt_api_endpoint
        );

-        self.request(Method::POST, &uri, req)
+        self.request(Method::PUT, &uri, req)
            .await?
            .json()
            .await
@@ -454,6 +456,20 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

+    pub async fn timeline_block_unblock_gc(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        dir: BlockUnblock,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc",
+            self.mgmt_api_endpoint,
+        );
+
+        self.request(Method::POST, &uri, ()).await.map(|_| ())
+    }
+
    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",
--- a/pageserver/compaction/Cargo.toml
+++ b/pageserver/compaction/Cargo.toml
@@ -9,41 +9,19 @@ default = []

 [dependencies]
 anyhow.workspace = true
-async-compression.workspace = true
 async-stream.workspace = true
-byteorder.workspace = true
-bytes.workspace = true
-chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
-const_format.workspace = true
-consumption_metrics.workspace = true
-crossbeam-utils.workspace = true
-either.workspace = true
-flate2.workspace = true
-fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
-hex.workspace = true
-humantime.workspace = true
-humantime-serde.workspace = true
 itertools.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pin-project-lite.workspace = true
 rand.workspace = true
-smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
-sync_wrapper.workspace = true
-thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
-tokio-io-timeout.workspace = true
-tokio-util.workspace = true
 tracing.workspace = true
-tracing-error.workspace = true
 tracing-subscriber.workspace = true
-url.workspace = true
-walkdir.workspace = true
-metrics.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true

--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
-bytes.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
@@ -24,5 +23,4 @@ toml_edit.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
-serde.workspace = true
 serde_json.workspace = true
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -4,7 +4,6 @@

 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -80,16 +79,24 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
        return None;
    }
    let keys: Vec<&str> = split[0].split('-').collect();
-    let mut lsns: Vec<&str> = split[1].split('-').collect();
-    let is_delta = if lsns.len() == 1 {
-        lsns.push(lsns[0]);
+    let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect();
+    let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect();
+    let the_lsns: [&str; 2];
+
+    /*
+     * Generations add a -vX-XXXXXX postfix, which causes issues when we try to
+     * parse 'vX' as an LSN.
+     */
+    let is_delta = if lsns.len() == 1 || lsns[1].is_empty() {
+        the_lsns = [lsns[0], lsns[0]];
        false
    } else {
+        the_lsns = [lsns[0], lsns[1]];
        true
    };

    let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
-    let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
+    let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap();
    let holes = Vec::new();
    Some(LayerFile {
        key_range,
@@ -148,7 +155,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    pageserver::virtual_file::init(
        10,
        virtual_file::api::IoEngineKind::StdFs,
-        DEFAULT_IO_BUFFER_ALIGNMENT,
+        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
    );
    pageserver::page_cache::init(100);

--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -3,7 +3,6 @@ use std::path::{Path, PathBuf};
 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
-use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
@@ -194,7 +193,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            pageserver::virtual_file::init(
                10,
                virtual_file::api::IoEngineKind::StdFs,
-                DEFAULT_IO_BUFFER_ALIGNMENT,
+                pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
            );
            pageserver::page_cache::init(100);

--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -20,14 +20,13 @@ use clap::{Parser, Subcommand};
 use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use pageserver::{
-    config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
    context::{DownloadBehavior, RequestContext},
    page_cache,
    task_mgr::TaskKind,
    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
    virtual_file,
 };
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId};
 use postgres_ffi::ControlFileData;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use tokio_util::sync::CancellationToken;
@@ -175,7 +174,7 @@ async fn main() -> anyhow::Result<()> {
                println!("specified prefix '{}' failed validation", cmd.prefix);
                return Ok(());
            };
-            let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
+            let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?;
            let toml_item = toml_document
                .get("remote_storage")
                .expect("need remote_storage");
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -30,9 +30,8 @@ use pageserver_api::reltag::{RelTag, SlruKind};

 use postgres_ffi::dispatch_pgversion;
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
-use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
+use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA};
 use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
-use postgres_ffi::TransactionId;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
@@ -255,8 +254,11 @@ where

        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;

+        let pgversion = self.timeline.pg_version;
+        let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]);
+
        // Create pgdata subdirs structure
-        for dir in PGDATA_SUBDIRS.iter() {
+        for dir in subdirs.iter() {
            let header = new_tar_header_dir(dir)?;
            self.ar
                .append(&header, &mut io::empty())
@@ -606,7 +608,7 @@ where
    //
    // Extract twophase state files
    //
-    async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
+    async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> {
        let img = self
            .timeline
            .get_twophase_file(xid, self.lsn, self.ctx)
@@ -617,7 +619,11 @@ where
        buf.extend_from_slice(&img[..]);
        let crc = crc32c::crc32c(&img[..]);
        buf.put_u32_le(crc);
-        let path = format!("pg_twophase/{:>08X}", xid);
+        let path = if self.timeline.pg_version < 17 {
+            format!("pg_twophase/{:>08X}", xid)
+        } else {
+            format!("pg_twophase/{:>016X}", xid)
+        };
        let header = new_tar_header(&path, buf.len() as u64)?;
        self.ar
            .append(&header, &buf[..])
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -5,6 +5,7 @@
 use std::env;
 use std::env::{var, VarError};
 use std::io::Read;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

@@ -36,6 +37,7 @@ use pageserver::{
    virtual_file,
 };
 use postgres_backend::AuthType;
+use utils::crashsafe::syncfs;
 use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::{
@@ -124,7 +126,6 @@ fn main() -> anyhow::Result<()> {
    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
-    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
    info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");

    // The tenants directory contains all the pageserver local disk state.
@@ -155,23 +156,7 @@ fn main() -> anyhow::Result<()> {
        };

        let started = Instant::now();
-        // Linux guarantees durability for syncfs.
-        // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
-        #[cfg(target_os = "linux")]
-        {
-            use std::os::fd::AsRawFd;
-            nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
-        }
-        #[cfg(target_os = "macos")]
-        {
-            // macOS is not a production platform for Neon, don't even bother.
-            drop(dirfd);
-        }
-        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
-        {
-            compile_error!("Unsupported OS");
-        }
-
+        syncfs(dirfd)?;
        let elapsed = started.elapsed();
        info!(
            elapsed_ms = elapsed.as_millis(),
@@ -223,27 +208,15 @@ fn initialize_config(
        }
    };

-    let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
-        Ok(mut f) => {
-            let md = f.metadata().context("stat config file")?;
-            if md.is_file() {
-                let mut s = String::new();
-                f.read_to_string(&mut s).context("read config file")?;
-                s.parse().context("parse config file toml")?
-            } else {
-                anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
-            }
-        }
-        Err(e) => {
-            anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
-        }
-    };
-
-    debug!("Using pageserver toml: {config}");
-
-    // Construct the runtime representation
-    let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
-        .context("Failed to parse pageserver configuration")?;
+    let config_file_contents =
+        std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?;
+    let config_toml = serde_path_to_error::deserialize(
+        toml_edit::de::Deserializer::from_str(&config_file_contents)
+            .context("build toml deserializer")?,
+    )
+    .context("deserialize config toml")?;
+    let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir)
+        .context("runtime-validation of config toml")?;

    Ok(Box::leak(Box::new(conf)))
 }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -178,7 +178,7 @@ async fn collect_metrics(
                )
                .await;
                if let Err(e) = res {
-                    tracing::error!("failed to upload to S3: {e:#}");
+                    tracing::error!("failed to upload to remote storage: {e:#}");
                }
            }
        };
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -1,7 +1,9 @@
-//! This module defines `RequestContext`, a structure that we use throughout
-//! the pageserver to propagate high-level context from places
-//! that _originate_ activity down to the shared code paths at the
-//! heart of the pageserver. It's inspired by Golang's `context.Context`.
+//! Defines [`RequestContext`].
+//!
+//! It is a structure that we use throughout the pageserver to propagate
+//! high-level context from places that _originate_ activity down to the
+//! shared code paths at the heart of the pageserver. It's inspired by
+//! Golang's `context.Context`.
 //!
 //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
 //! 1. What high-level activity ([`TaskKind`]) needs this page?
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -141,10 +141,24 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                        m.other
                    );

-                    let az_id = m
-                        .other
-                        .get("availability_zone_id")
-                        .and_then(|jv| jv.as_str().map(|str| str.to_owned()));
+                    let az_id = {
+                        let az_id_from_metadata = m
+                            .other
+                            .get("availability_zone_id")
+                            .and_then(|jv| jv.as_str().map(|str| str.to_owned()));
+
+                        match az_id_from_metadata {
+                            Some(az_id) => Some(az_id),
+                            None => {
+                                tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
+                                conf.availability_zone.clone()
+                            }
+                        }
+                    };
+
+                    if az_id.is_none() {
+                        panic!("Availablity zone id could not be inferred from metadata.json or pageserver config");
+                    }

                    Some(NodeRegisterRequest {
                        node_id: conf.id,
@@ -152,7 +166,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                        listen_pg_port: m.postgres_port,
                        listen_http_addr: m.http_host,
                        listen_http_port: m.http_port,
-                        availability_zone_id: az_id,
+                        availability_zone_id: az_id.expect("Checked above"),
                    })
                }
                Err(e) => {
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -41,19 +41,15 @@
 // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
 //   reading these fields. We use the Debug impl for semi-structured logging, though.

-use std::{
-    sync::Arc,
-    time::{Duration, SystemTime},
-};
+use std::{sync::Arc, time::SystemTime};

 use anyhow::Context;
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId};
 use remote_storage::GenericRemoteStorage;
-use serde::{Deserialize, Serialize};
+use serde::Serialize;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
-use utils::serde_percent::Percent;
 use utils::{completion, id::TimelineId};

 use crate::{
@@ -69,23 +65,9 @@ use crate::{
    CancellableTask, DiskUsageEvictionTask,
 };

-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub struct DiskUsageEvictionTaskConfig {
-    pub max_usage_pct: Percent,
-    pub min_avail_bytes: u64,
-    #[serde(with = "humantime_serde")]
-    pub period: Duration,
-    #[cfg(feature = "testing")]
-    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
-    /// Select sorting for evicted layers
-    #[serde(default)]
-    pub eviction_order: EvictionOrder,
-}
-
 /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
 /// partitioning.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "type", content = "args")]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum EvictionOrder {
    /// Order the layers to be evicted by how recently they have been accessed relatively within
    /// the set of resident layers of a tenant.
@@ -96,23 +78,22 @@ pub enum EvictionOrder {
        /// we read tenants is deterministic. If we find the need to use this as `false`, we need
        /// to ensure nondeterminism by adding in a random number to break the
        /// `relative_last_activity==0.0` ties.
-        #[serde(default = "default_highest_layer_count_loses_first")]
        highest_layer_count_loses_first: bool,
    },
 }

-impl Default for EvictionOrder {
-    fn default() -> Self {
-        Self::RelativeAccessed {
-            highest_layer_count_loses_first: true,
+impl From<pageserver_api::config::EvictionOrder> for EvictionOrder {
+    fn from(value: pageserver_api::config::EvictionOrder) -> Self {
+        match value {
+            pageserver_api::config::EvictionOrder::RelativeAccessed {
+                highest_layer_count_loses_first,
+            } => Self::RelativeAccessed {
+                highest_layer_count_loses_first,
+            },
        }
    }
 }

-fn default_highest_layer_count_loses_first() -> bool {
-    true
-}
-
 impl EvictionOrder {
    fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
        use EvictionOrder::*;
@@ -295,7 +276,7 @@ async fn disk_usage_eviction_task_iteration(
        storage,
        usage_pre,
        tenant_manager,
-        task_config.eviction_order,
+        task_config.eviction_order.into(),
        cancel,
    )
    .await;
@@ -1257,7 +1238,6 @@ mod filesystem_level_usage {

    #[test]
    fn max_usage_pct_pressure() {
-        use super::EvictionOrder;
        use super::Usage as _;
        use std::time::Duration;
        use utils::serde_percent::Percent;
@@ -1269,7 +1249,7 @@ mod filesystem_level_usage {
                period: Duration::MAX,
                #[cfg(feature = "testing")]
                mock_statvfs: None,
-                eviction_order: EvictionOrder::default(),
+                eviction_order: pageserver_api::config::EvictionOrder::default(),
            },
            total_bytes: 100_000,
            avail_bytes: 0,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -468,7 +468,7 @@ async fn build_timeline_info_common(
        pg_version: timeline.pg_version,

        state,
-        is_archived,
+        is_archived: Some(is_archived),

        walreceiver_status,

@@ -2076,7 +2076,7 @@ async fn disk_usage_eviction_run(
        evict_bytes: u64,

        #[serde(default)]
-        eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
+        eviction_order: pageserver_api::config::EvictionOrder,
    }

    #[derive(Debug, Clone, Copy, serde::Serialize)]
@@ -2112,7 +2112,7 @@ async fn disk_usage_eviction_run(
        &state.remote_storage,
        usage,
        &state.tenant_manager,
-        config.eviction_order,
+        config.eviction_order.into(),
        &cancel,
    )
    .await;
@@ -2955,7 +2955,7 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
            |r| api_handler(r, timeline_preserve_initdb_handler),
        )
-        .post(
+        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
            |r| api_handler(r, timeline_archival_config_handler),
        )
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -19,6 +19,7 @@ use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
+use crate::walrecord::decode_wal_record;
 use crate::walrecord::DecodedWALRecord;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
@@ -310,11 +311,13 @@ async fn import_wal(

        let mut nrecords = 0;
        let mut modification = tline.begin_modification(last_lsn);
-        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= endpoint {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                let mut decoded = DecodedWALRecord::default();
+                decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
+
                walingest
-                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
+                    .ingest_record(decoded, lsn, &mut modification, ctx)
                    .await?;
                WAL_INGEST.records_committed.inc();

@@ -449,11 +452,12 @@ pub async fn import_wal_from_tar(
        waldecoder.feed_bytes(&bytes[offset..]);

        let mut modification = tline.begin_modification(last_lsn);
-        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= end_lsn {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                let mut decoded = DecodedWALRecord::default();
+                decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
                walingest
-                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
+                    .ingest_record(decoded, lsn, &mut modification, ctx)
                    .await?;
                modification.commit(ctx).await?;
                last_lsn = lsn;
@@ -576,9 +580,11 @@ async fn import_file(
        import_slru(modification, slru, file_path, reader, len, ctx).await?;
        debug!("imported multixact members slru");
    } else if file_path.starts_with("pg_twophase") {
-        let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
-
        let bytes = read_all_bytes(reader).await?;
+
+        // In PostgreSQL v17, this is a 64-bit FullTransactionid. In previous versions,
+        // it's a 32-bit TransactionId, which fits in u64 anyway.
+        let xid = u64::from_str_radix(file_name.as_ref(), 16)?;
        modification
            .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx)
            .await?;
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,9 +1,7 @@
 use std::{num::NonZeroUsize, sync::Arc};

-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+#[derive(Debug, PartialEq, Eq, Clone)]
 pub enum L0FlushConfig {
-    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
 }

@@ -16,6 +14,16 @@ impl Default for L0FlushConfig {
    }
 }

+impl From<pageserver_api::models::L0FlushConfig> for L0FlushConfig {
+    fn from(config: pageserver_api::models::L0FlushConfig) -> Self {
+        match config {
+            pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => {
+                Self::Direct { max_concurrency }
+            }
+        }
+    }
+}
+
 #[derive(Clone)]
 pub struct L0FlushGlobalState(Arc<Inner>);

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -9,7 +9,7 @@ use metrics::{
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, VariantNames};
-use strum_macros::{EnumVariantNames, IntoStaticStr};
+use strum_macros::{IntoStaticStr, VariantNames};
 use tracing::warn;
 use utils::id::TimelineId;

@@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 ];

 // Metrics collected on operations on the storage repository.
-#[derive(Debug, EnumVariantNames, IntoStaticStr)]
+#[derive(Debug, VariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum StorageTimeOperation {
    #[strum(serialize = "layer flush")]
@@ -1177,10 +1177,10 @@ pub(crate) mod virtual_file_io_engine {
 }

 struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
-    global_metric: &'a Histogram,
+    global_latency_histo: &'a Histogram,

    // Optional because not all op types are tracked per-timeline
-    timeline_metric: Option<&'a Histogram>,
+    per_timeline_latency_histo: Option<&'a Histogram>,

    ctx: &'c RequestContext,
    start: std::time::Instant,
@@ -1212,9 +1212,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
                elapsed
            }
        };
-        self.global_metric.observe(ex_throttled.as_secs_f64());
-        if let Some(timeline_metric) = self.timeline_metric {
-            timeline_metric.observe(ex_throttled.as_secs_f64());
+        self.global_latency_histo
+            .observe(ex_throttled.as_secs_f64());
+        if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
+            per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
        }
    }
 }
@@ -1240,10 +1241,32 @@ pub enum SmgrQueryType {

 #[derive(Debug)]
 pub(crate) struct SmgrQueryTimePerTimeline {
-    global_metrics: [Histogram; SmgrQueryType::COUNT],
-    per_timeline_getpage: Histogram,
+    global_started: [IntCounter; SmgrQueryType::COUNT],
+    global_latency: [Histogram; SmgrQueryType::COUNT],
+    per_timeline_getpage_started: IntCounter,
+    per_timeline_getpage_latency: Histogram,
 }

+static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
+        "pageserver_smgr_query_started_global_count",
+        "Number of smgr queries started, aggregated by query type.",
+        &["smgr_query_type"],
+    )
+    .expect("failed to define a metric")
+});
+
+static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
+        "pageserver_smgr_query_started_count",
+        "Number of smgr queries started, aggregated by query type and tenant/timeline.",
+        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
+    )
+    .expect("failed to define a metric")
+});
+
 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
@@ -1319,14 +1342,20 @@ impl SmgrQueryTimePerTimeline {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
        let shard_slug = format!("{}", tenant_shard_id.shard_slug());
        let timeline_id = timeline_id.to_string();
-        let global_metrics = std::array::from_fn(|i| {
+        let global_started = std::array::from_fn(|i| {
+            let op = SmgrQueryType::from_repr(i).unwrap();
+            SMGR_QUERY_STARTED_GLOBAL
+                .get_metric_with_label_values(&[op.into()])
+                .unwrap()
+        });
+        let global_latency = std::array::from_fn(|i| {
            let op = SmgrQueryType::from_repr(i).unwrap();
            SMGR_QUERY_TIME_GLOBAL
                .get_metric_with_label_values(&[op.into()])
                .unwrap()
        });

-        let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+        let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
            .get_metric_with_label_values(&[
                SmgrQueryType::GetPageAtLsn.into(),
                &tenant_id,
@@ -1334,18 +1363,32 @@ impl SmgrQueryTimePerTimeline {
                &timeline_id,
            ])
            .unwrap();
+        let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+            .get_metric_with_label_values(&[
+                SmgrQueryType::GetPageAtLsn.into(),
+                &tenant_id,
+                &shard_slug,
+                &timeline_id,
+            ])
+            .unwrap();
+
        Self {
-            global_metrics,
-            per_timeline_getpage,
+            global_started,
+            global_latency,
+            per_timeline_getpage_latency,
+            per_timeline_getpage_started,
        }
    }
    pub(crate) fn start_timer<'c: 'a, 'a>(
        &'a self,
        op: SmgrQueryType,
        ctx: &'c RequestContext,
-    ) -> Option<impl Drop + '_> {
-        let global_metric = &self.global_metrics[op as usize];
+    ) -> Option<impl Drop + 'a> {
        let start = Instant::now();
+
+        self.global_started[op as usize].inc();
+
+        // We subtract time spent throttled from the observed latency.
        match ctx.micros_spent_throttled.open() {
            Ok(()) => (),
            Err(error) => {
@@ -1364,15 +1407,16 @@ impl SmgrQueryTimePerTimeline {
            }
        }

-        let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
-            Some(&self.per_timeline_getpage)
+        let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
+            self.per_timeline_getpage_started.inc();
+            Some(&self.per_timeline_getpage_latency)
        } else {
            None
        };

        Some(GlobalAndPerTimelineHistogramTimer {
-            global_metric,
-            timeline_metric,
+            global_latency_histo: &self.global_latency[op as usize],
+            per_timeline_latency_histo,
            ctx,
            start,
            op,
@@ -1423,9 +1467,12 @@ mod smgr_query_time_tests {
            let get_counts = || {
                let global: u64 = ops
                    .iter()
-                    .map(|op| metrics.global_metrics[*op as usize].get_sample_count())
+                    .map(|op| metrics.global_latency[*op as usize].get_sample_count())
                    .sum();
-                (global, metrics.per_timeline_getpage.get_sample_count())
+                (
+                    global,
+                    metrics.per_timeline_getpage_latency.get_sample_count(),
+                )
            };

            let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -1487,7 +1534,7 @@ impl BasebackupQueryTime {
    pub(crate) fn start_recording<'c: 'a, 'a>(
        &'a self,
        ctx: &'c RequestContext,
-    ) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
+    ) -> BasebackupQueryTimeOngoingRecording<'a, 'a> {
        let start = Instant::now();
        match ctx.micros_spent_throttled.open() {
            Ok(()) => (),
@@ -1777,7 +1824,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
    .expect("failed to define a metric"),
    upload_heatmap_duration: register_histogram!(
        "pageserver_secondary_upload_heatmap_duration",
-        "Time to build and upload a heatmap, including any waiting inside the S3 client"
+        "Time to build and upload a heatmap, including any waiting inside the remote storage client"
    )
    .expect("failed to define a metric"),
    download_heatmap: register_int_counter!(
@@ -2576,6 +2623,12 @@ impl TimelineMetrics {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
        }

+        let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
+            SmgrQueryType::GetPageAtLsn.into(),
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
        let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
            SmgrQueryType::GetPageAtLsn.into(),
            tenant_id,
@@ -2592,6 +2645,8 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
        let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
    }

+    tenant_throttling::remove_tenant_metrics(tenant_shard_id);
+
    // we leave the BROKEN_TENANTS_SET entry if any
 }

@@ -3055,41 +3110,173 @@ pub mod tokio_epoll_uring {
 pub(crate) mod tenant_throttling {
    use metrics::{register_int_counter_vec, IntCounter};
    use once_cell::sync::Lazy;
+    use utils::shard::TenantShardId;

    use crate::tenant::{self, throttle::Metric};

-    pub(crate) struct TimelineGet {
-        wait_time: IntCounter,
-        count: IntCounter,
+    struct GlobalAndPerTenantIntCounter {
+        global: IntCounter,
+        per_tenant: IntCounter,
    }

-    pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
-        static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-            register_int_counter_vec!(
-            "pageserver_tenant_throttling_wait_usecs_sum_global",
-            "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
+    impl GlobalAndPerTenantIntCounter {
+        #[inline(always)]
+        pub(crate) fn inc(&self) {
+            self.inc_by(1)
+        }
+        #[inline(always)]
+        pub(crate) fn inc_by(&self, n: u64) {
+            self.global.inc_by(n);
+            self.per_tenant.inc_by(n);
+        }
+    }
+
+    pub(crate) struct TimelineGet {
+        count_accounted_start: GlobalAndPerTenantIntCounter,
+        count_accounted_finish: GlobalAndPerTenantIntCounter,
+        wait_time: GlobalAndPerTenantIntCounter,
+        count_throttled: GlobalAndPerTenantIntCounter,
+    }
+
+    static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_start_global",
+            "Count of tenant throttling starts, by kind of throttle.",
            &["kind"]
        )
-            .unwrap()
-        });
-
-        static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-            register_int_counter_vec!(
-                "pageserver_tenant_throttling_count_global",
-                "Count of tenant throttlings, by kind of throttle.",
-                &["kind"]
-            )
-            .unwrap()
-        });
-
-        let kind = "timeline_get";
-        TimelineGet {
-            wait_time: WAIT_USECS.with_label_values(&[kind]),
-            count: WAIT_COUNT.with_label_values(&[kind]),
-        }
+        .unwrap()
+    });
+    static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_start",
+            "Count of tenant throttling starts, by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
+    });
+    static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_finish_global",
+            "Count of tenant throttling finishes, by kind of throttle.",
+            &["kind"]
+        )
+        .unwrap()
+    });
+    static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_finish",
+            "Count of tenant throttling finishes, by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
+    });
+    static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_wait_usecs_sum_global",
+            "Sum of microseconds that spent waiting throttle by kind of throttle.",
+            &["kind"]
+        )
+        .unwrap()
+    });
+    static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_wait_usecs_sum",
+            "Sum of microseconds that spent waiting throttle by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
    });

-    impl Metric for &'static TimelineGet {
+    static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_global",
+            "Count of tenant throttlings, by kind of throttle.",
+            &["kind"]
+        )
+        .unwrap()
+    });
+    static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count",
+            "Count of tenant throttlings, by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
+    });
+
+    const KIND: &str = "timeline_get";
+
+    impl TimelineGet {
+        pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
+            let per_tenant_label_values = &[
+                KIND,
+                &tenant_shard_id.tenant_id.to_string(),
+                &tenant_shard_id.shard_slug().to_string(),
+            ];
+            TimelineGet {
+                count_accounted_start: {
+                    GlobalAndPerTenantIntCounter {
+                        global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
+                        per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
+                            .with_label_values(per_tenant_label_values),
+                    }
+                },
+                count_accounted_finish: {
+                    GlobalAndPerTenantIntCounter {
+                        global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
+                        per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
+                            .with_label_values(per_tenant_label_values),
+                    }
+                },
+                wait_time: {
+                    GlobalAndPerTenantIntCounter {
+                        global: WAIT_USECS.with_label_values(&[KIND]),
+                        per_tenant: WAIT_USECS_PER_TENANT
+                            .with_label_values(per_tenant_label_values),
+                    }
+                },
+                count_throttled: {
+                    GlobalAndPerTenantIntCounter {
+                        global: WAIT_COUNT.with_label_values(&[KIND]),
+                        per_tenant: WAIT_COUNT_PER_TENANT
+                            .with_label_values(per_tenant_label_values),
+                    }
+                },
+            }
+        }
+    }
+
+    pub(crate) fn preinitialize_global_metrics() {
+        Lazy::force(&COUNT_ACCOUNTED_START);
+        Lazy::force(&COUNT_ACCOUNTED_FINISH);
+        Lazy::force(&WAIT_USECS);
+        Lazy::force(&WAIT_COUNT);
+    }
+
+    pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
+        for m in &[
+            &COUNT_ACCOUNTED_START_PER_TENANT,
+            &COUNT_ACCOUNTED_FINISH_PER_TENANT,
+            &WAIT_USECS_PER_TENANT,
+            &WAIT_COUNT_PER_TENANT,
+        ] {
+            let _ = m.remove_label_values(&[
+                KIND,
+                &tenant_shard_id.tenant_id.to_string(),
+                &tenant_shard_id.shard_slug().to_string(),
+            ]);
+        }
+    }
+
+    impl Metric for TimelineGet {
+        #[inline(always)]
+        fn accounting_start(&self) {
+            self.count_accounted_start.inc();
+        }
+        #[inline(always)]
+        fn accounting_finish(&self) {
+            self.count_accounted_finish.inc();
+        }
        #[inline(always)]
        fn observe_throttling(
            &self,
@@ -3097,7 +3284,7 @@ pub(crate) mod tenant_throttling {
        ) {
            let val = u64::try_from(wait_time.as_micros()).unwrap();
            self.wait_time.inc_by(val);
-            self.count.inc();
+            self.count_throttled.inc();
        }
    }
 }
@@ -3227,11 +3414,14 @@ pub fn preinitialize_metrics() {
    }

    // countervecs
-    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
-        .into_iter()
-        .for_each(|c| {
-            Lazy::force(c);
-        });
+    [
+        &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
+        &SMGR_QUERY_STARTED_GLOBAL,
+    ]
+    .into_iter()
+    .for_each(|c| {
+        Lazy::force(c);
+    });

    // gauges
    WALRECEIVER_ACTIVE_MANAGERS.get();
@@ -3253,7 +3443,8 @@ pub fn preinitialize_metrics() {

    // Custom
    Lazy::force(&RECONSTRUCT_TIME);
-    Lazy::force(&tenant_throttling::TIMELINE_GET);
    Lazy::force(&BASEBACKUP_QUERY_TIME);
    Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
+
+    tenant_throttling::preinitialize_global_metrics();
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1199,7 +1199,6 @@ impl PageServerHandler {
    }
 }

-#[async_trait::async_trait]
 impl<IO> postgres_backend::Handler<IO> for PageServerHandler
 where
    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
-use anyhow::{bail, ensure, Context};
+use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use pageserver_api::key::{
@@ -168,7 +168,9 @@ impl Timeline {
        DatadirModification {
            tline: self,
            pending_lsns: Vec::new(),
-            pending_updates: HashMap::new(),
+            pending_metadata_pages: HashMap::new(),
+            pending_data_pages: Vec::new(),
+            pending_zero_data_pages: Default::default(),
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
            pending_directory_entries: Vec::new(),
@@ -631,7 +633,7 @@ impl Timeline {

    pub(crate) async fn get_twophase_file(
        &self,
-        xid: TransactionId,
+        xid: u64,
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
@@ -644,11 +646,19 @@ impl Timeline {
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
-    ) -> Result<HashSet<TransactionId>, PageReconstructError> {
+    ) -> Result<HashSet<u64>, PageReconstructError> {
        // fetch directory entry
        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;

-        Ok(TwoPhaseDirectory::des(&buf)?.xids)
+        if self.pg_version >= 17 {
+            Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
+        } else {
+            Ok(TwoPhaseDirectory::des(&buf)?
+                .xids
+                .iter()
+                .map(|x| u64::from(*x))
+                .collect())
+        }
    }

    pub(crate) async fn get_control_file(
@@ -727,8 +737,12 @@ impl Timeline {
        let current_policy = self.last_aux_file_policy.load();
        match current_policy {
            Some(AuxFilePolicy::V1) => {
-                warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
-                self.list_aux_files_v1(lsn, ctx).await
+                let res = self.list_aux_files_v1(lsn, ctx).await?;
+                let empty_str = if res.is_empty() { ", empty" } else { "" };
+                warn!(
+                    "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})"
+                );
+                Ok(res)
            }
            None => {
                let res = self.list_aux_files_v1(lsn, ctx).await?;
@@ -826,6 +840,36 @@ impl Timeline {
        Ok(total_size * BLCKSZ as u64)
    }

+    /// Get a KeySpace that covers all the Keys that are in use at AND below the given LSN. This is only used
+    /// for gc-compaction.
+    ///
+    /// gc-compaction cannot use the same `collect_keyspace` function as the legacy compaction because it
+    /// processes data at multiple LSNs and needs to be aware of the fact that some key ranges might need to
+    /// be kept only for a specific range of LSN.
+    ///
+    /// Consider the case that the user created branches at LSN 10 and 20, where the user created a table A at
+    /// LSN 10 and dropped that table at LSN 20. `collect_keyspace` at LSN 10 will return the key range
+    /// corresponding to that table, while LSN 20 won't. The keyspace info at a single LSN is not enough to
+    /// determine which keys to retain/drop for gc-compaction.
+    ///
+    /// For now, it only drops AUX-v1 keys. But in the future, the function will be extended to return the keyspace
+    /// to be retained for each of the branch LSN.
+    ///
+    /// The return value is (dense keyspace, sparse keyspace).
+    pub(crate) async fn collect_gc_compaction_keyspace(
+        &self,
+    ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
+        let metadata_key_begin = Key::metadata_key_range().start;
+        let aux_v1_key = AUX_FILES_KEY;
+        let dense_keyspace = KeySpace {
+            ranges: vec![Key::MIN..aux_v1_key, aux_v1_key.next()..metadata_key_begin],
+        };
+        Ok((
+            dense_keyspace,
+            SparseKeySpace(KeySpace::single(Key::metadata_key_range())),
+        ))
+    }
+
    ///
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
@@ -896,9 +940,13 @@ impl Timeline {

        // Then pg_twophase
        result.add_key(TWOPHASEDIR_KEY);
-        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
-        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
-        let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
+
+        let mut xids: Vec<u64> = self
+            .list_twophase_files(lsn, ctx)
+            .await?
+            .iter()
+            .cloned()
+            .collect();
        xids.sort_unstable();
        for xid in xids {
            result.add_key(twophase_file_key(xid));
@@ -1015,9 +1063,10 @@ impl Timeline {
 }

 /// DatadirModification represents an operation to ingest an atomic set of
-/// updates to the repository. It is created by the 'begin_record'
-/// function. It is called for each WAL record, so that all the modifications
-/// by a one WAL record appear atomic.
+/// updates to the repository.
+///
+/// It is created by the 'begin_record' function. It is called for each WAL
+/// record, so that all the modifications by a one WAL record appear atomic.
 pub struct DatadirModification<'a> {
    /// The timeline this modification applies to. You can access this to
    /// read the state, but note that any pending updates are *not* reflected
@@ -1031,10 +1080,24 @@ pub struct DatadirModification<'a> {
    // The put-functions add the modifications here, and they are flushed to the
    // underlying key-value store by the 'finish' function.
    pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,

+    /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications
+    /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'.
+    pending_metadata_pages: HashMap<CompactKey, Vec<(Lsn, usize, Value)>>,
+
+    /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
+    /// which keys are stored here.
+    pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
+
+    // Sometimes during ingest, for example when extending a relation, we would like to write a zero page.  However,
+    // if we encounter a write from postgres in the same wal record, we will drop this entry.
+    //
+    // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
+    // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
+    pending_zero_data_pages: HashSet<CompactKey>,
+
    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,
@@ -1058,6 +1121,10 @@ impl<'a> DatadirModification<'a> {
        self.pending_bytes
    }

+    pub(crate) fn has_dirty_data_pages(&self) -> bool {
+        (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
+    }
+
    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
        ensure!(
@@ -1066,6 +1133,10 @@ impl<'a> DatadirModification<'a> {
            lsn,
            self.lsn
        );
+
+        // If we are advancing LSN, then state from previous wal record should have been flushed.
+        assert!(self.pending_zero_data_pages.is_empty());
+
        if lsn > self.lsn {
            self.pending_lsns.push(self.lsn);
            self.lsn = lsn;
@@ -1073,6 +1144,17 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

+    /// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means
+    /// keys that represent literal blocks that postgres can read.  So data includes relation blocks and
+    /// SLRU blocks, which are read directly by postgres, and everything else is considered metadata.
+    ///
+    /// The distinction is important because data keys are handled on a fast path where dirty writes are
+    /// not readable until this modification is committed, whereas metadata keys are visible for read
+    /// via [`Self::get`] as soon as their record has been ingested.
+    fn is_data_key(key: &Key) -> bool {
+        key.is_rel_block_key() || key.is_slru_block_key()
+    }
+
    /// Initialize a completely new repository.
    ///
    /// This inserts the directory metadata entries that are assumed to
@@ -1087,9 +1169,15 @@ impl<'a> DatadirModification<'a> {
        // Create AuxFilesDirectory
        self.init_aux_dir()?;

-        let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
-            xids: HashSet::new(),
-        })?;
+        let buf = if self.tline.pg_version >= 17 {
+            TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
+                xids: HashSet::new(),
+            })
+        } else {
+            TwoPhaseDirectory::ser(&TwoPhaseDirectory {
+                xids: HashSet::new(),
+            })
+        }?;
        self.pending_directory_entries
            .push((DirectoryKind::TwoPhase, 0));
        self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
@@ -1165,6 +1253,13 @@ impl<'a> DatadirModification<'a> {
        img: Bytes,
    ) -> anyhow::Result<()> {
        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        let key = rel_block_to_key(rel, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver at {}",
+                key
+            );
+        }
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
    }
@@ -1176,10 +1271,63 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        img: Bytes,
    ) -> anyhow::Result<()> {
-        self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
+        let key = slru_block_to_key(kind, segno, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver at {}",
+                key
+            );
+        }
+        self.put(key, Value::Image(img));
        Ok(())
    }

+    pub(crate) fn put_rel_page_image_zero(
+        &mut self,
+        rel: RelTag,
+        blknum: BlockNumber,
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        let key = rel_block_to_key(rel, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver: {} @ {}",
+                key,
+                self.lsn
+            );
+        }
+        self.pending_zero_data_pages.insert(key.to_compact());
+        self.pending_bytes += ZERO_PAGE.len();
+        Ok(())
+    }
+
+    pub(crate) fn put_slru_page_image_zero(
+        &mut self,
+        kind: SlruKind,
+        segno: u32,
+        blknum: BlockNumber,
+    ) -> anyhow::Result<()> {
+        let key = slru_block_to_key(kind, segno, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver: {} @ {}",
+                key,
+                self.lsn
+            );
+        }
+        self.pending_zero_data_pages.insert(key.to_compact());
+        self.pending_bytes += ZERO_PAGE.len();
+        Ok(())
+    }
+
+    /// Call this at the end of each WAL record.
+    pub(crate) fn on_record_end(&mut self) {
+        let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
+        for key in pending_zero_data_pages {
+            self.put_data(key, Value::Image(ZERO_PAGE.clone()));
+        }
+    }
+
    /// Store a relmapper file (pg_filenode.map) in the repository
    pub async fn put_relmap_file(
        &mut self,
@@ -1221,22 +1369,31 @@ impl<'a> DatadirModification<'a> {

    pub async fn put_twophase_file(
        &mut self,
-        xid: TransactionId,
+        xid: u64,
        img: Bytes,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Add it to the directory entry
-        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let mut dir = TwoPhaseDirectory::des(&buf)?;
-        if !dir.xids.insert(xid) {
-            anyhow::bail!("twophase file for xid {} already exists", xid);
-        }
-        self.pending_directory_entries
-            .push((DirectoryKind::TwoPhase, dir.xids.len()));
-        self.put(
-            TWOPHASEDIR_KEY,
-            Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
-        );
+        let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
+        let newdirbuf = if self.tline.pg_version >= 17 {
+            let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
+            if !dir.xids.insert(xid) {
+                anyhow::bail!("twophase file for xid {} already exists", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
+        } else {
+            let xid = xid as u32;
+            let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
+            if !dir.xids.insert(xid) {
+                anyhow::bail!("twophase file for xid {} already exists", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
+        };
+        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));

        self.put(twophase_file_key(xid), Value::Image(img));
        Ok(())
@@ -1539,22 +1696,32 @@ impl<'a> DatadirModification<'a> {
    /// This method is used for marking truncated SLRU files
    pub async fn drop_twophase_file(
        &mut self,
-        xid: TransactionId,
+        xid: u64,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Remove it from the directory entry
        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let mut dir = TwoPhaseDirectory::des(&buf)?;
+        let newdirbuf = if self.tline.pg_version >= 17 {
+            let mut dir = TwoPhaseDirectoryV17::des(&buf)?;

-        if !dir.xids.remove(&xid) {
-            warn!("twophase file for xid {} does not exist", xid);
-        }
-        self.pending_directory_entries
-            .push((DirectoryKind::TwoPhase, dir.xids.len()));
-        self.put(
-            TWOPHASEDIR_KEY,
-            Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
-        );
+            if !dir.xids.remove(&xid) {
+                warn!("twophase file for xid {} does not exist", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
+        } else {
+            let xid: u32 = u32::try_from(xid)?;
+            let mut dir = TwoPhaseDirectory::des(&buf)?;
+
+            if !dir.xids.remove(&xid) {
+                warn!("twophase file for xid {} does not exist", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
+        };
+        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));

        // Delete it
        self.delete(twophase_key_range(xid));
@@ -1597,7 +1764,7 @@ impl<'a> DatadirModification<'a> {
                if aux_files_key_v1.is_empty() {
                    None
                } else {
-                    warn!("this timeline is using deprecated aux file policy V1");
+                    warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)");
                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
                    Some(AuxFilePolicy::V1)
                }
@@ -1778,7 +1945,7 @@ impl<'a> DatadirModification<'a> {
    /// retains all the metadata, but data pages are flushed. That's again OK
    /// for bulk import, where you are just loading data pages and won't try to
    /// modify the same pages twice.
-    pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
+    pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        // Unless we have accumulated a decent amount of changes, it's not worth it
        // to scan through the pending_updates list.
        let pending_nblocks = self.pending_nblocks;
@@ -1789,31 +1956,11 @@ impl<'a> DatadirModification<'a> {
        let mut writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
-        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
-        for (key, values) in self.pending_updates.drain() {
-            if !key.is_valid_key_on_write_path() {
-                bail!(
-                    "the request contains data not supported by pageserver at TimelineWriter::put: {}", key
-                );
-            }
-            let mut write_batch = Vec::new();
-            for (lsn, value_ser_size, value) in values {
-                if key.is_rel_block_key() || key.is_slru_block_key() {
-                    // This bails out on first error without modifying pending_updates.
-                    // That's Ok, cf this function's doc comment.
-                    write_batch.push((key.to_compact(), lsn, value_ser_size, value));
-                } else {
-                    retained_pending_updates.entry(key).or_default().push((
-                        lsn,
-                        value_ser_size,
-                        value,
-                    ));
-                }
-            }
-            writer.put_batch(write_batch, ctx).await?;
-        }
+        let pending_data_pages = std::mem::take(&mut self.pending_data_pages);

-        self.pending_updates = retained_pending_updates;
+        // This bails out on first error without modifying pending_updates.
+        // That's Ok, cf this function's doc comment.
+        writer.put_batch(pending_data_pages, ctx).await?;
        self.pending_bytes = 0;

        if pending_nblocks != 0 {
@@ -1834,29 +1981,31 @@ impl<'a> DatadirModification<'a> {
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
+        // Commit should never be called mid-wal-record
+        assert!(self.pending_zero_data_pages.is_empty());
+
        let mut writer = self.tline.writer().await;

        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

-        if !self.pending_updates.is_empty() {
-            // Ordering: the items in this batch do not need to be in any global order, but values for
-            // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
-            // this to do efficient updates to its index.
-            let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
-                .pending_updates
+        // Ordering: the items in this batch do not need to be in any global order, but values for
+        // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
+        // this to do efficient updates to its index.
+        let mut write_batch = std::mem::take(&mut self.pending_data_pages);
+
+        write_batch.extend(
+            self.pending_metadata_pages
                .drain()
                .flat_map(|(key, values)| {
-                    values.into_iter().map(move |(lsn, val_ser_size, value)| {
-                        if !key.is_valid_key_on_write_path() {
-                            bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key);
-                        }
-                        Ok((key.to_compact(), lsn, val_ser_size, value))
-                    })
-                })
-                .collect::<anyhow::Result<Vec<_>>>()?;
+                    values
+                        .into_iter()
+                        .map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
+                }),
+        );

-            writer.put_batch(batch, ctx).await?;
+        if !write_batch.is_empty() {
+            writer.put_batch(write_batch, ctx).await?;
        }

        if !self.pending_deletions.is_empty() {
@@ -1887,33 +2036,58 @@ impl<'a> DatadirModification<'a> {
    }

    pub(crate) fn len(&self) -> usize {
-        self.pending_updates.len() + self.pending_deletions.len()
+        self.pending_metadata_pages.len()
+            + self.pending_data_pages.len()
+            + self.pending_deletions.len()
    }

-    // Internal helper functions to batch the modifications
-
+    /// Read a page from the Timeline we are writing to.  For metadata pages, this passes through
+    /// a cache in Self, which makes writes earlier in this modification visible to WAL records later
+    /// in the modification.
+    ///
+    /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data
+    /// page must ensure that the pages they read are already committed in Timeline, for example
+    /// DB create operations are always preceded by a call to commit().  This is special cased because
+    /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes,
+    /// and not data pages.
    async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
-        // Have we already updated the same key? Read the latest pending updated
-        // version in that case.
-        //
-        // Note: we don't check pending_deletions. It is an error to request a
-        // value that has been removed, deletion only avoids leaking storage.
-        if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, _, value)) = values.last() {
-                return if let Value::Image(img) = value {
-                    Ok(img.clone())
-                } else {
-                    // Currently, we never need to read back a WAL record that we
-                    // inserted in the same "transaction". All the metadata updates
-                    // work directly with Images, and we never need to read actual
-                    // data pages. We could handle this if we had to, by calling
-                    // the walredo manager, but let's keep it simple for now.
-                    Err(PageReconstructError::Other(anyhow::anyhow!(
-                        "unexpected pending WAL record"
-                    )))
-                };
+        if !Self::is_data_key(&key) {
+            // Have we already updated the same key? Read the latest pending updated
+            // version in that case.
+            //
+            // Note: we don't check pending_deletions. It is an error to request a
+            // value that has been removed, deletion only avoids leaking storage.
+            if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) {
+                if let Some((_, _, value)) = values.last() {
+                    return if let Value::Image(img) = value {
+                        Ok(img.clone())
+                    } else {
+                        // Currently, we never need to read back a WAL record that we
+                        // inserted in the same "transaction". All the metadata updates
+                        // work directly with Images, and we never need to read actual
+                        // data pages. We could handle this if we had to, by calling
+                        // the walredo manager, but let's keep it simple for now.
+                        Err(PageReconstructError::Other(anyhow::anyhow!(
+                            "unexpected pending WAL record"
+                        )))
+                    };
+                }
+            }
+        } else {
+            // This is an expensive check, so we only do it in debug mode. If reading a data key,
+            // this key should never be present in pending_data_pages. We ensure this by committing
+            // modifications before ingesting DB create operations, which are the only kind that reads
+            // data pages during ingest.
+            if cfg!(debug_assertions) {
+                for (dirty_key, _, _, _) in &self.pending_data_pages {
+                    debug_assert!(&key.to_compact() != dirty_key);
+                }
+
+                debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
            }
        }
+
+        // Metadata page cache miss, or we're reading a data page.
        let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
        self.tline.get(key, lsn, ctx).await
    }
@@ -1925,11 +2099,40 @@ impl<'a> DatadirModification<'a> {
    }

    fn put(&mut self, key: Key, val: Value) {
-        let values = self.pending_updates.entry(key).or_default();
+        if Self::is_data_key(&key) {
+            self.put_data(key.to_compact(), val)
+        } else {
+            self.put_metadata(key.to_compact(), val)
+        }
+    }
+
+    fn put_data(&mut self, key: CompactKey, val: Value) {
+        let val_serialized_size = val.serialized_size().unwrap() as usize;
+
+        // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write.  This
+        // is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
+        // and the subsequent postgres-originating write
+        if self.pending_zero_data_pages.remove(&key) {
+            self.pending_bytes -= ZERO_PAGE.len();
+        }
+
+        self.pending_bytes += val_serialized_size;
+        self.pending_data_pages
+            .push((key, self.lsn, val_serialized_size, val))
+    }
+
+    fn put_metadata(&mut self, key: CompactKey, val: Value) {
+        let values = self.pending_metadata_pages.entry(key).or_default();
        // Replace the previous value if it exists at the same lsn
        if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
            if *last_lsn == self.lsn {
+                // Update the pending_bytes contribution from this entry, and update the serialized size in place
+                self.pending_bytes -= *last_value_ser_size;
                *last_value_ser_size = val.serialized_size().unwrap() as usize;
+                self.pending_bytes += *last_value_ser_size;
+
+                // Use the latest value, this replaces any earlier write to the same (key,lsn), such as much
+                // have been generated by synthesized zero page writes prior to the first real write to a page.
                *last_value = val;
                return;
            }
@@ -1948,6 +2151,7 @@ impl<'a> DatadirModification<'a> {

 /// This struct facilitates accessing either a committed key from the timeline at a
 /// specific LSN, or the latest uncommitted key from a pending modification.
+///
 /// During WAL ingestion, the records from multiple LSNs may be batched in the same
 /// modification before being flushed to the timeline. Hence, the routines in WalIngest
 /// need to look up the keys in the modification first before looking them up in the
@@ -1987,11 +2191,21 @@ struct DbDirectory {
    dbdirs: HashMap<(Oid, Oid), bool>,
 }

+// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of
+// pg_twophase files was expanded from 32-bit XIDs to 64-bit XIDs.  Previously, the files
+// were named like "pg_twophase/000002E5", now they're like
+// "pg_twophsae/0000000A000002E4".
+
 #[derive(Debug, Serialize, Deserialize)]
 struct TwoPhaseDirectory {
    xids: HashSet<TransactionId>,
 }

+#[derive(Debug, Serialize, Deserialize)]
+struct TwoPhaseDirectoryV17 {
+    xids: HashSet<u64>,
+}
+
 #[derive(Debug, Serialize, Deserialize, Default)]
 struct RelDirectory {
    // Set of relations that exist. (relfilenode, forknum)
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -60,32 +60,7 @@ pub mod mock {
    use regex::Regex;
    use tracing::log::info;

-    #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-    #[serde(tag = "type")]
-    pub enum Behavior {
-        Success {
-            blocksize: u64,
-            total_blocks: u64,
-            name_filter: Option<utils::serde_regex::Regex>,
-        },
-        Failure {
-            mocked_error: MockedError,
-        },
-    }
-
-    #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-    #[allow(clippy::upper_case_acronyms)]
-    pub enum MockedError {
-        EIO,
-    }
-
-    impl From<MockedError> for nix::Error {
-        fn from(e: MockedError) -> Self {
-            match e {
-                MockedError::EIO => nix::Error::EIO,
-            }
-        }
-    }
+    pub use pageserver_api::config::statvfs::mock::Behavior;

    pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result<Statvfs> {
        info!("running mocked statvfs");
@@ -116,6 +91,7 @@ pub mod mock {
                    block_size: *blocksize,
                })
            }
+            #[cfg(feature = "testing")]
            Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
        }
    }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1,8 +1,9 @@
+//! Timeline repository implementation that keeps old data in layer files, and
+//! the recent changes in ephemeral files.
 //!
-//! Timeline repository implementation that keeps old data in files on disk, and
-//! the recent changes in memory. See tenant/*_layer.rs files.
-//! The functions here are responsible for locating the correct layer for the
-//! get/put call, walking back the timeline branching history as needed.
+//! See tenant/*_layer.rs files. The functions here are responsible for locating
+//! the correct layer for the get/put call, walking back the timeline branching
+//! history as needed.
 //!
 //! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
 //! directory. See docs/pageserver-storage.md for how the files are managed.
@@ -17,7 +18,6 @@ use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
-use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models;
 use pageserver_api::models::AuxFilePolicy;
@@ -33,6 +33,7 @@ use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
 use std::collections::BTreeMap;
 use std::fmt;
+use std::future::Future;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
@@ -139,6 +140,7 @@ pub mod metadata;
 pub mod remote_timeline_client;
 pub mod storage_layer;

+pub mod checks;
 pub mod config;
 pub mod mgr;
 pub mod secondary;
@@ -300,7 +302,7 @@ pub struct Tenant {
    /// Throttle applied at the top of [`Timeline::get`].
    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
    pub(crate) timeline_get_throttle:
-        Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
+        Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,

    /// An ongoing timeline detach concurrency limiter.
    ///
@@ -1029,13 +1031,9 @@ impl Tenant {
        }

        Ok(TenantPreload {
-            timelines: Self::load_timeline_metadata(
-                self,
-                remote_timeline_ids,
-                remote_storage,
-                cancel,
-            )
-            .await?,
+            timelines: self
+                .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
+                .await?,
        })
    }

@@ -1301,7 +1299,7 @@ impl Tenant {
        .await
    }

-    async fn load_timeline_metadata(
+    async fn load_timelines_metadata(
        self: &Arc<Tenant>,
        timeline_ids: HashSet<TimelineId>,
        remote_storage: &GenericRemoteStorage,
@@ -1309,33 +1307,10 @@ impl Tenant {
    ) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
        let mut part_downloads = JoinSet::new();
        for timeline_id in timeline_ids {
-            let client = RemoteTimelineClient::new(
-                remote_storage.clone(),
-                self.deletion_queue_client.clone(),
-                self.conf,
-                self.tenant_shard_id,
-                timeline_id,
-                self.generation,
-            );
            let cancel_clone = cancel.clone();
            part_downloads.spawn(
-                async move {
-                    debug!("starting index part download");
-
-                    let index_part = client.download_index_file(&cancel_clone).await;
-
-                    debug!("finished index part download");
-
-                    Result::<_, anyhow::Error>::Ok(TimelinePreload {
-                        client,
-                        timeline_id,
-                        index_part,
-                    })
-                }
-                .map(move |res| {
-                    res.with_context(|| format!("download index part for timeline {timeline_id}"))
-                })
-                .instrument(info_span!("download_index_part", %timeline_id)),
+                self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
+                    .instrument(info_span!("download_index_part", %timeline_id)),
            );
        }

@@ -1346,8 +1321,7 @@ impl Tenant {
                next = part_downloads.join_next() => {
                    match next {
                        Some(result) => {
-                            let preload_result = result.context("join preload task")?;
-                            let preload = preload_result?;
+                            let preload = result.context("join preload task")?;
                            timeline_preloads.insert(preload.timeline_id, preload);
                        },
                        None => {
@@ -1364,6 +1338,36 @@ impl Tenant {
        Ok(timeline_preloads)
    }

+    fn load_timeline_metadata(
+        self: &Arc<Tenant>,
+        timeline_id: TimelineId,
+        remote_storage: GenericRemoteStorage,
+        cancel: CancellationToken,
+    ) -> impl Future<Output = TimelinePreload> {
+        let client = RemoteTimelineClient::new(
+            remote_storage.clone(),
+            self.deletion_queue_client.clone(),
+            self.conf,
+            self.tenant_shard_id,
+            timeline_id,
+            self.generation,
+        );
+        async move {
+            debug_assert_current_span_has_tenant_and_timeline_id();
+            debug!("starting index part download");
+
+            let index_part = client.download_index_file(&cancel).await;
+
+            debug!("finished index part download");
+
+            TimelinePreload {
+                client,
+                timeline_id,
+                index_part,
+            }
+        }
+    }
+
    pub(crate) async fn apply_timeline_archival_config(
        &self,
        timeline_id: TimelineId,
@@ -1572,6 +1576,9 @@ impl Tenant {
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
        end_lsn: Lsn,
    ) -> anyhow::Result<Arc<Timeline>> {
+        use checks::check_valid_layermap;
+        use itertools::Itertools;
+
        let tline = self
            .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
            .await?;
@@ -1586,6 +1593,18 @@ impl Tenant {
                .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
                .await?;
        }
+        let layer_names = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .unwrap()
+            .iter_historic_layers()
+            .map(|layer| layer.layer_name())
+            .collect_vec();
+        if let Some(err) = check_valid_layermap(&layer_names) {
+            bail!("invalid layermap: {err}");
+        }
        Ok(tline)
    }

@@ -1949,9 +1968,6 @@ impl Tenant {
                TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
                    panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
                }
-                TenantState::Loading => {
-                    *current_state = TenantState::Activating(ActivatingFrom::Loading);
-                }
                TenantState::Attaching => {
                    *current_state = TenantState::Activating(ActivatingFrom::Attaching);
                }
@@ -2132,7 +2148,7 @@ impl Tenant {
    async fn set_stopping(
        &self,
        progress: completion::Barrier,
-        allow_transition_from_loading: bool,
+        _allow_transition_from_loading: bool,
        allow_transition_from_attaching: bool,
    ) -> Result<(), SetStoppingError> {
        let mut rx = self.state.subscribe();
@@ -2147,7 +2163,6 @@ impl Tenant {
                );
                false
            }
-            TenantState::Loading => allow_transition_from_loading,
            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
        })
        .await
@@ -2166,13 +2181,6 @@ impl Tenant {
                *current_state = TenantState::Stopping { progress };
                true
            }
-            TenantState::Loading => {
-                if !allow_transition_from_loading {
-                    unreachable!("3we ensured above that we're done with activation, and, there is no re-activation")
-                };
-                *current_state = TenantState::Stopping { progress };
-                true
-            }
            TenantState::Active => {
                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
                // are created after the transition to Stopping. That's harmless, as the Timelines
@@ -2228,7 +2236,7 @@ impl Tenant {
        // The load & attach routines own the tenant state until it has reached `Active`.
        // So, wait until it's done.
        rx.wait_for(|state| match state {
-            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+            TenantState::Activating(_) | TenantState::Attaching => {
                info!(
                    "waiting for {} to turn Active|Broken|Stopping",
                    <&'static str>::from(state)
@@ -2248,7 +2256,7 @@ impl Tenant {
        let reason = reason.to_string();
        self.state.send_modify(|current_state| {
            match *current_state {
-                TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                TenantState::Activating(_) | TenantState::Attaching => {
                    unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
                }
                TenantState::Active => {
@@ -2292,7 +2300,7 @@ impl Tenant {
        loop {
            let current_state = receiver.borrow_and_update().clone();
            match current_state {
-                TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
+                TenantState::Attaching | TenantState::Activating(_) => {
                    // in these states, there's a chance that we can reach ::Active
                    self.activate_now();
                    match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
@@ -2814,7 +2822,7 @@ impl Tenant {
            gate: Gate::default(),
            timeline_get_throttle: Arc::new(throttle::Throttle::new(
                Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
-                &crate::metrics::tenant_throttling::TIMELINE_GET,
+                crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
            )),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
            ongoing_timeline_detach: std::sync::Mutex::default(),
@@ -3196,6 +3204,9 @@ impl Tenant {
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
        end_lsn: Lsn,
    ) -> anyhow::Result<Arc<Timeline>> {
+        use checks::check_valid_layermap;
+        use itertools::Itertools;
+
        let tline = self
            .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
            .await?;
@@ -3216,6 +3227,18 @@ impl Tenant {
                .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
                .await?;
        }
+        let layer_names = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .unwrap()
+            .iter_historic_layers()
+            .map(|layer| layer.layer_name())
+            .collect_vec();
+        if let Some(err) = check_valid_layermap(&layer_names) {
+            bail!("invalid layermap: {err}");
+        }
        Ok(tline)
    }

@@ -3593,7 +3616,7 @@ impl Tenant {
        start_lsn: Lsn,
        ancestor: Option<Arc<Timeline>>,
        last_aux_file_policy: Option<AuxFilePolicy>,
-    ) -> anyhow::Result<UninitializedTimeline> {
+    ) -> anyhow::Result<UninitializedTimeline<'a>> {
        let tenant_shard_id = self.tenant_shard_id;

        let resources = self.build_timeline_resources(new_timeline_id);
@@ -4110,7 +4133,7 @@ pub(crate) mod harness {
            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));

            let tenant = Arc::new(Tenant::new(
-                TenantState::Loading,
+                TenantState::Attaching,
                self.conf,
                AttachedTenantConf::try_from(LocationConf::attached_single(
                    TenantConfOpt::from(self.tenant_conf.clone()),
@@ -4163,9 +4186,18 @@ pub(crate) mod harness {
            let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
            if records_neon {
                // For Neon wal records, we can decode without spawning postgres, so do so.
-                let base_img = base_img.expect("Neon WAL redo requires base image").1;
-                let mut page = BytesMut::new();
-                page.extend_from_slice(&base_img);
+                let mut page = match (base_img, records.first()) {
+                    (Some((_lsn, img)), _) => {
+                        let mut page = BytesMut::new();
+                        page.extend_from_slice(&img);
+                        page
+                    }
+                    (_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(),
+                    _ => {
+                        panic!("Neon WAL redo requires base image or will init record");
+                    }
+                };
+
                for (record_lsn, record) in records {
                    apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
                }
@@ -7090,13 +7122,13 @@ mod tests {
            vec![
                // Image layer at GC horizon
                PersistentLayerKey {
-                    key_range: Key::MIN..Key::NON_L0_MAX,
+                    key_range: Key::MIN..Key::MAX,
                    lsn_range: Lsn(0x30)..Lsn(0x31),
                    is_delta: false
                },
-                // The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
+                // The delta layer below the horizon
                PersistentLayerKey {
-                    key_range: Key::MIN..Key::NON_L0_MAX,
+                    key_range: get_key(3)..get_key(4),
                    lsn_range: Lsn(0x30)..Lsn(0x48),
                    is_delta: true
                },
@@ -8469,4 +8501,135 @@ mod tests {

        Ok(())
    }
+
+    // Regression test for https://github.com/neondatabase/neon/issues/9012
+    // Create an image arrangement where we have to read at different LSN ranges
+    // from a delta layer. This is achieved by overlapping an image layer on top of
+    // a delta layer. Like so:
+    //
+    //     A      B
+    // +----------------+ -> delta_layer
+    // |                |                           ^ lsn
+    // |       =========|-> nested_image_layer      |
+    // |       C        |                           |
+    // +----------------+                           |
+    // ======== -> baseline_image_layer             +-------> key
+    //
+    //
+    // When querying the key range [A, B) we need to read at different LSN ranges
+    // for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
+    #[tokio::test]
+    async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        let will_init_keys = [2, 6];
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let mut expected_key_values = HashMap::new();
+
+        let baseline_image_layer_lsn = Lsn(0x10);
+        let mut baseline_img_layer = Vec::new();
+        for i in 0..5 {
+            let key = get_key(i);
+            let value = format!("value {i}@{baseline_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            baseline_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let nested_image_layer_lsn = Lsn(0x50);
+        let mut nested_img_layer = Vec::new();
+        for i in 5..10 {
+            let key = get_key(i);
+            let value = format!("value {i}@{nested_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            nested_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let mut delta_layer_spec = Vec::default();
+        let delta_layer_start_lsn = Lsn(0x20);
+        let mut delta_layer_end_lsn = delta_layer_start_lsn;
+
+        for i in 0..10 {
+            let key = get_key(i);
+            let key_in_nested = nested_img_layer
+                .iter()
+                .any(|(key_with_img, _)| *key_with_img == key);
+            let lsn = {
+                if key_in_nested {
+                    Lsn(nested_image_layer_lsn.0 + 0x10)
+                } else {
+                    delta_layer_start_lsn
+                }
+            };
+
+            let will_init = will_init_keys.contains(&i);
+            if will_init {
+                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
+
+                expected_key_values.insert(key, "".to_string());
+            } else {
+                let delta = format!("@{lsn}");
+                delta_layer_spec.push((
+                    key,
+                    lsn,
+                    Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                ));
+
+                expected_key_values
+                    .get_mut(&key)
+                    .expect("An image exists for each key")
+                    .push_str(delta.as_str());
+            }
+            delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn);
+        }
+
+        delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1);
+
+        assert!(
+            nested_image_layer_lsn > delta_layer_start_lsn
+                && nested_image_layer_lsn < delta_layer_end_lsn
+        );
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                baseline_image_layer_lsn,
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+                    delta_layer_start_lsn..delta_layer_end_lsn,
+                    delta_layer_spec,
+                )], // delta layers
+                vec![
+                    (baseline_image_layer_lsn, baseline_img_layer),
+                    (nested_image_layer_lsn, nested_img_layer),
+                ], // image layers
+                delta_layer_end_lsn,
+            )
+            .await?;
+
+        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let results = tline
+            .get_vectored(keyspace, delta_layer_end_lsn, &ctx)
+            .await
+            .expect("No vectored errors");
+        for (key, res) in results {
+            let value = res.expect("No key errors");
+            let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
+            assert_eq!(value, Bytes::from(expected_value));
+        }
+
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/checks.rs
+++ b/pageserver/src/tenant/checks.rs
@@ -0,0 +1,55 @@
+use std::collections::BTreeSet;
+
+use itertools::Itertools;
+
+use super::storage_layer::LayerName;
+
+/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
+/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
+///
+/// ```plain
+/// |       |                 |       |
+/// |   1   |    |   2   |    |   3   |
+/// |       |    |       |    |       |
+/// ```
+///
+/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
+/// the same LSN range.
+///
+/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
+///
+/// ```plain
+/// |       |    |   2   |    |       |
+/// |   1   |    |-------|    |   3   |
+/// |       |    |   4   |    |       |
+///
+/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
+pub fn check_valid_layermap(metadata: &[LayerName]) -> Option<String> {
+    let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
+    let mut all_delta_layers = Vec::new();
+    for name in metadata {
+        if let LayerName::Delta(layer) = name {
+            if layer.key_range.start.next() != layer.key_range.end {
+                all_delta_layers.push(layer.clone());
+            }
+        }
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = &layer.lsn_range;
+        lsn_split_point.insert(lsn_range.start);
+        lsn_split_point.insert(lsn_range.end);
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = layer.lsn_range.clone();
+        let intersects = lsn_split_point.range(lsn_range).collect_vec();
+        if intersects.len() > 1 {
+            let err = format!(
+                "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
+                layer,
+                intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
+            );
+            return Some(err);
+        }
+    }
+    None
+}
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,11 +9,10 @@
 //! may lead to a data loss.
 //!
 use anyhow::bail;
+pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::AuxFilePolicy;
-use pageserver_api::models::CompactionAlgorithm;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::LsnLease;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
@@ -23,50 +22,6 @@ use std::num::NonZeroU64;
 use std::time::Duration;
 use utils::generation::Generation;

-pub mod defaults {
-
-    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
-    // would be more appropriate. But a low value forces the code to be exercised more,
-    // which is good for now to trigger bugs.
-    // This parameter actually determines L0 layer file size.
-    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
-    pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
-
-    // FIXME the below configs are only used by legacy algorithm. The new algorithm
-    // has different parameters.
-
-    // Target file size, when creating image and delta layers.
-    // This parameter determines L1 layer file size.
-    pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
-
-    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
-    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
-    pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
-        super::CompactionAlgorithm::Legacy;
-
-    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
-
-    // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
-    // If there's a need to decrease this value, first make sure that GC
-    // doesn't hold a layer map write lock for non-trivial operations.
-    // Relevant: https://github.com/neondatabase/neon/issues/3394
-    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
-    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
-    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
-    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
-    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
-    // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
-    // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
-    // throughputs up to 1GiB/s per timeline.
-    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
-    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
-    // By default ingest enough WAL for two new L0 layers before checking if new image
-    // image layers should be created.
-    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
-}
-
 #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) enum AttachmentMode {
    /// Our generation is current as far as we know, and as far as we know we are the only attached
@@ -281,96 +236,20 @@ impl LocationConf {
    }
 }

-/// A tenant's calcuated configuration, which is the result of merging a
-/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
-///
-/// For storing and transmitting individual tenant's configuration, see
-/// TenantConfOpt.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub struct TenantConf {
-    // Flush out an inmemory layer, if it's holding WAL older than this
-    // This puts a backstop on how much WAL needs to be re-digested if the
-    // page server crashes.
-    // This parameter actually determines L0 layer file size.
-    pub checkpoint_distance: u64,
-    // Inmemory layer is also flushed at least once in checkpoint_timeout to
-    // eventually upload WAL after activity is stopped.
-    #[serde(with = "humantime_serde")]
-    pub checkpoint_timeout: Duration,
-    // Target file size, when creating image and delta layers.
-    // This parameter determines L1 layer file size.
-    pub compaction_target_size: u64,
-    // How often to check if there's compaction work to be done.
-    // Duration::ZERO means automatic compaction is disabled.
-    #[serde(with = "humantime_serde")]
-    pub compaction_period: Duration,
-    // Level0 delta layer threshold for compaction.
-    pub compaction_threshold: usize,
-    pub compaction_algorithm: CompactionAlgorithmSettings,
-    // Determines how much history is retained, to allow
-    // branching and read replicas at an older point in time.
-    // The unit is #of bytes of WAL.
-    // Page versions older than this are garbage collected away.
-    pub gc_horizon: u64,
-    // Interval at which garbage collection is triggered.
-    // Duration::ZERO means automatic GC is disabled
-    #[serde(with = "humantime_serde")]
-    pub gc_period: Duration,
-    // Delta layer churn threshold to create L1 image layers.
-    pub image_creation_threshold: usize,
-    // Determines how much history is retained, to allow
-    // branching and read replicas at an older point in time.
-    // The unit is time.
-    // Page versions older than this are garbage collected away.
-    #[serde(with = "humantime_serde")]
-    pub pitr_interval: Duration,
-    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
-    #[serde(with = "humantime_serde")]
-    pub walreceiver_connect_timeout: Duration,
-    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
-    /// A stalled safekeeper will be changed to a newer one when it appears.
-    #[serde(with = "humantime_serde")]
-    pub lagging_wal_timeout: Duration,
-    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
-    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
-    /// to avoid eager reconnects.
-    pub max_lsn_wal_lag: NonZeroU64,
-    pub eviction_policy: EvictionPolicy,
-    pub min_resident_size_override: Option<u64>,
-    // See the corresponding metric's help string.
-    #[serde(with = "humantime_serde")]
-    pub evictions_low_residence_duration_metric_threshold: Duration,
-
-    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
-    /// may be disabled if a Tenant will not have secondary locations: only secondary
-    /// locations will use the heatmap uploaded by attached locations.
-    #[serde(with = "humantime_serde")]
-    pub heatmap_period: Duration,
-
-    /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
-    pub lazy_slru_download: bool,
-
-    pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
-
-    // How much WAL must be ingested before checking again whether a new image layer is required.
-    // Expresed in multiples of checkpoint distance.
-    pub image_layer_creation_check_threshold: u8,
-
-    /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
-    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
-    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
-    /// file is written.
-    pub switch_aux_file_policy: AuxFilePolicy,
-
-    /// The length for an explicit LSN lease request.
-    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
-    #[serde(with = "humantime_serde")]
-    pub lsn_lease_length: Duration,
-
-    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
-    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
-    #[serde(with = "humantime_serde")]
-    pub lsn_lease_length_for_ts: Duration,
+impl Default for LocationConf {
+    // TODO: this should be removed once tenant loading can guarantee that we are never
+    // loading from a directory without a configuration.
+    // => tech debt since https://github.com/neondatabase/neon/issues/1555
+    fn default() -> Self {
+        Self {
+            mode: LocationMode::Attached(AttachedLocationConfig {
+                generation: Generation::none(),
+                attach_mode: AttachmentMode::Single,
+            }),
+            tenant_conf: TenantConfOpt::default(),
+            shard: ShardIdentity::unsharded(),
+        }
+    }
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -545,51 +424,6 @@ impl TenantConfOpt {
    }
 }

-impl Default for TenantConf {
-    fn default() -> Self {
-        use defaults::*;
-        Self {
-            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
-            checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
-                .expect("cannot parse default checkpoint timeout"),
-            compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
-            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
-                .expect("cannot parse default compaction period"),
-            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
-            compaction_algorithm: CompactionAlgorithmSettings {
-                kind: DEFAULT_COMPACTION_ALGORITHM,
-            },
-            gc_horizon: DEFAULT_GC_HORIZON,
-            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
-                .expect("cannot parse default gc period"),
-            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
-            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
-                .expect("cannot parse default PITR interval"),
-            walreceiver_connect_timeout: humantime::parse_duration(
-                DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
-            )
-            .expect("cannot parse default walreceiver connect timeout"),
-            lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
-                .expect("cannot parse default walreceiver lagging wal timeout"),
-            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
-                .expect("cannot parse default max walreceiver Lsn wal lag"),
-            eviction_policy: EvictionPolicy::NoEviction,
-            min_resident_size_override: None,
-            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
-                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
-            )
-            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
-            heatmap_period: Duration::ZERO,
-            lazy_slru_download: false,
-            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
-            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
-            switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
-            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
-            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
-        }
-    }
-}
-
 impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
    type Error = anyhow::Error;

@@ -618,7 +452,8 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
            }
            toml_edit::Item::Table(table) => {
-                let deserializer = toml_edit::de::Deserializer::new(table.into());
+                let deserializer =
+                    toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
                return serde_path_to_error::deserialize(deserializer)
                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
            }
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,11 +1,29 @@
-use std::collections::HashMap;
-
-use utils::id::TimelineId;
+use std::{collections::HashMap, time::Duration};

 use super::remote_timeline_client::index::GcBlockingReason;
+use tokio::time::Instant;
+use utils::id::TimelineId;

-type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
+type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;

+#[derive(Default)]
+struct Storage {
+    timelines_blocked: TimelinesBlocked,
+    /// The deadline before which we are blocked from GC so that
+    /// leases have a chance to be renewed.
+    lsn_lease_deadline: Option<Instant>,
+}
+
+impl Storage {
+    fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
+        self.lsn_lease_deadline
+            .map(|d| Instant::now() < d)
+            .unwrap_or(false)
+    }
+}
+
+/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
+/// blocking.
 #[derive(Default)]
 pub(crate) struct GcBlock {
    /// The timelines which have current reasons to block gc.
@@ -13,6 +31,12 @@ pub(crate) struct GcBlock {
    /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
    /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
    reasons: std::sync::Mutex<Storage>,
+
+    /// GC background task or manually run `Tenant::gc_iteration` holds a lock on this.
+    ///
+    /// Do not add any more features taking and forbidding taking this lock. It should be
+    /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
+    /// synchronizes with gc attempts by locking and unlocking this mutex.
    blocking: tokio::sync::Mutex<()>,
 }

@@ -42,6 +66,20 @@ impl GcBlock {
        }
    }

+    /// Sets a deadline before which we cannot proceed to GC due to lsn lease.
+    ///
+    /// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
+    /// length, we guarantee that all the leases we granted before will have a chance to renew
+    /// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
+    pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
+        let deadline = Instant::now() + lsn_lease_length;
+        let mut g = self.reasons.lock().unwrap();
+        g.lsn_lease_deadline = Some(deadline);
+    }
+
+    /// Describe the current gc blocking reasons.
+    ///
+    /// TODO: make this json serializable.
    pub(crate) fn summary(&self) -> Option<BlockingReasons> {
        let g = self.reasons.lock().unwrap();

@@ -64,7 +102,7 @@ impl GcBlock {
    ) -> anyhow::Result<bool> {
        let (added, uploaded) = {
            let mut g = self.reasons.lock().unwrap();
-            let set = g.entry(timeline.timeline_id).or_default();
+            let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
            let added = set.insert(reason);

            // LOCK ORDER: intentionally hold the lock, see self.reasons.
@@ -95,7 +133,7 @@ impl GcBlock {

        let (remaining_blocks, uploaded) = {
            let mut g = self.reasons.lock().unwrap();
-            match g.entry(timeline.timeline_id) {
+            match g.timelines_blocked.entry(timeline.timeline_id) {
                Entry::Occupied(mut oe) => {
                    let set = oe.get_mut();
                    set.remove(reason);
@@ -109,7 +147,7 @@ impl GcBlock {
                }
            }

-            let remaining_blocks = g.len();
+            let remaining_blocks = g.timelines_blocked.len();

            // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
            let uploaded = timeline
@@ -134,11 +172,11 @@ impl GcBlock {
    pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
        let unblocked = {
            let mut g = self.reasons.lock().unwrap();
-            if g.is_empty() {
+            if g.timelines_blocked.is_empty() {
                return;
            }

-            g.remove(&timeline.timeline_id);
+            g.timelines_blocked.remove(&timeline.timeline_id);

            BlockingReasons::clean_and_summarize(g).is_none()
        };
@@ -149,10 +187,11 @@ impl GcBlock {
    }

    /// Initialize with the non-deleted timelines of this tenant.
-    pub(crate) fn set_scanned(&self, scanned: Storage) {
+    pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
        let mut g = self.reasons.lock().unwrap();
-        assert!(g.is_empty());
-        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
+        assert!(g.timelines_blocked.is_empty());
+        g.timelines_blocked
+            .extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));

        if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
            tracing::info!(summary=?reasons, "initialized with gc blocked");
@@ -166,6 +205,7 @@ pub(super) struct Guard<'a> {

 #[derive(Debug)]
 pub(crate) struct BlockingReasons {
+    tenant_blocked_by_lsn_lease_deadline: bool,
    timelines: usize,
    reasons: enumset::EnumSet<GcBlockingReason>,
 }
@@ -174,8 +214,8 @@ impl std::fmt::Display for BlockingReasons {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
-            "{} timelines block for {:?}",
-            self.timelines, self.reasons
+            "tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
+            self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
        )
    }
 }
@@ -183,13 +223,15 @@ impl std::fmt::Display for BlockingReasons {
 impl BlockingReasons {
    fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
        let mut reasons = enumset::EnumSet::empty();
-        g.retain(|_key, value| {
+        g.timelines_blocked.retain(|_key, value| {
            reasons = reasons.union(*value);
            !value.is_empty()
        });
-        if !g.is_empty() {
+        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
+        if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
            Some(BlockingReasons {
-                timelines: g.len(),
+                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
+                timelines: g.timelines_blocked.len(),
                reasons,
            })
        } else {
@@ -198,14 +240,17 @@ impl BlockingReasons {
    }

    fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        if g.is_empty() {
+        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
+        if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
            None
        } else {
            let reasons = g
+                .timelines_blocked
                .values()
                .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
            Some(BlockingReasons {
-                timelines: g.len(),
+                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
+                timelines: g.timelines_blocked.len(),
                reasons,
            })
        }
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,7 +1,8 @@
-//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
-//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
-//! this struct and it's original serialization format is still needed because they were written a
-//! long time ago.
+//! Describes the legacy now hopefully no longer modified per-timeline metadata.
+//!
+//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and
+//! their timelines, this struct and its original serialization format is still needed because
+//! they were written a long time ago.
 //!
 //! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
 //! versioning.
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -282,9 +282,10 @@ impl BackgroundPurges {
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

-/// The TenantManager is responsible for storing and mutating the collection of all tenants
-/// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
-/// lives inside the TenantManager.
+/// Responsible for storing and mutating the collection of all tenants
+/// that this pageserver has state for.
+///
+/// Every Tenant and SecondaryTenant instance lives inside the TenantManager.
 ///
 /// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
 /// the same tenant twice concurrently, or trying to configure the same tenant into secondary
@@ -948,6 +949,12 @@ impl TenantManager {
                (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
                    match attach_conf.generation.cmp(&tenant.generation) {
                        Ordering::Equal => {
+                            if attach_conf.attach_mode == AttachmentMode::Single {
+                                tenant
+                                    .gc_block
+                                    .set_lsn_lease_deadline(tenant.get_lsn_lease_length());
+                            }
+
                            // A transition from Attached to Attached in the same generation, we may
                            // take our fast path and just provide the updated configuration
                            // to the tenant.
@@ -2346,8 +2353,9 @@ pub enum TenantMapError {
    ShuttingDown,
 }

-/// Guards a particular tenant_id's content in the TenantsMap.  While this
-/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
+/// Guards a particular tenant_id's content in the TenantsMap.
+///
+/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
 /// for this tenant, which acts as a marker for any operations targeting
 /// this tenant to retry later, or wait for the InProgress state to end.
 ///
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2184,6 +2184,8 @@ pub fn remote_timeline_path(
    remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
 }

+/// Obtains the path of the given Layer in the remote
+///
 /// Note that the shard component of a remote layer path is _not_ always the same
 /// as in the TenantShardId of the caller: tenants may reference layers from a different
 /// ShardIndex.  Use the ShardIndex from the layer's metadata.
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -548,7 +548,7 @@ pub(crate) async fn download_initdb_tar_zst(
        cancel,
    )
    .await
-    .map_err(|e| {
+    .inspect_err(|_e| {
        // Do a best-effort attempt at deleting the temporary file upon encountering an error.
        // We don't have async here nor do we want to pile on any extra errors.
        if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -556,7 +556,6 @@ pub(crate) async fn download_initdb_tar_zst(
                warn!("error deleting temporary file {temp_path}: {e}");
            }
        }
-        e
    })?;

    Ok((temp_path, file))
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -1,4 +1,5 @@
 //! In-memory index to track the tenant files on the remote storage.
+//!
 //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
 //! remote timeline layers and its metadata.

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -1,13 +1,13 @@
 //! Common traits and structs for layers

 pub mod delta_layer;
+pub mod filter_iterator;
 pub mod image_layer;
 pub mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;
-
 pub mod split_writer;

 use crate::context::{AccessStatsBehavior, RequestContext};
@@ -276,6 +276,16 @@ pub(crate) enum LayerId {
    InMemoryLayerId(InMemoryLayerFileId),
 }

+/// Uniquely identify a layer visit by the layer
+/// and LSN floor (or start LSN) of the reads.
+/// The layer itself is not enough since we may
+/// have different LSN lower bounds for delta layer reads.
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+struct LayerToVisitId {
+    layer_id: LayerId,
+    lsn_floor: Lsn,
+}
+
 /// Layer wrapper for the read path. Note that it is valid
 /// to use these layers even after external operations have
 /// been performed on them (compaction, freeze, etc.).
@@ -287,9 +297,9 @@ pub(crate) enum ReadableLayer {

 /// A partial description of a read to be done.
 #[derive(Debug, Clone)]
-struct ReadDesc {
+struct LayerVisit {
    /// An id used to resolve the readable layer within the fringe
-    layer_id: LayerId,
+    layer_to_visit_id: LayerToVisitId,
    /// Lsn range for the read, used for selecting the next read
    lsn_range: Range<Lsn>,
 }
@@ -303,12 +313,12 @@ struct ReadDesc {
 /// a two layer indexing scheme.
 #[derive(Debug)]
 pub(crate) struct LayerFringe {
-    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
-    layers: HashMap<LayerId, LayerKeyspace>,
+    planned_visits_by_lsn: BinaryHeap<LayerVisit>,
+    visit_reads: HashMap<LayerToVisitId, LayerVisitReads>,
 }

 #[derive(Debug)]
-struct LayerKeyspace {
+struct LayerVisitReads {
    layer: ReadableLayer,
    target_keyspace: KeySpaceRandomAccum,
 }
@@ -316,23 +326,23 @@ struct LayerKeyspace {
 impl LayerFringe {
    pub(crate) fn new() -> Self {
        LayerFringe {
-            planned_reads_by_lsn: BinaryHeap::new(),
-            layers: HashMap::new(),
+            planned_visits_by_lsn: BinaryHeap::new(),
+            visit_reads: HashMap::new(),
        }
    }

    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
-        let read_desc = match self.planned_reads_by_lsn.pop() {
+        let read_desc = match self.planned_visits_by_lsn.pop() {
            Some(desc) => desc,
            None => return None,
        };

-        let removed = self.layers.remove_entry(&read_desc.layer_id);
+        let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id);

        match removed {
            Some((
                _,
-                LayerKeyspace {
+                LayerVisitReads {
                    layer,
                    mut target_keyspace,
                },
@@ -351,20 +361,24 @@ impl LayerFringe {
        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
    ) {
-        let layer_id = layer.id();
-        let entry = self.layers.entry(layer_id.clone());
+        let layer_to_visit_id = LayerToVisitId {
+            layer_id: layer.id(),
+            lsn_floor: lsn_range.start,
+        };
+
+        let entry = self.visit_reads.entry(layer_to_visit_id.clone());
        match entry {
            Entry::Occupied(mut entry) => {
                entry.get_mut().target_keyspace.add_keyspace(keyspace);
            }
            Entry::Vacant(entry) => {
-                self.planned_reads_by_lsn.push(ReadDesc {
+                self.planned_visits_by_lsn.push(LayerVisit {
                    lsn_range,
-                    layer_id: layer_id.clone(),
+                    layer_to_visit_id: layer_to_visit_id.clone(),
                });
                let mut accum = KeySpaceRandomAccum::new();
                accum.add_keyspace(keyspace);
-                entry.insert(LayerKeyspace {
+                entry.insert(LayerVisitReads {
                    layer,
                    target_keyspace: accum,
                });
@@ -379,7 +393,7 @@ impl Default for LayerFringe {
    }
 }

-impl Ord for ReadDesc {
+impl Ord for LayerVisit {
    fn cmp(&self, other: &Self) -> Ordering {
        let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
        if ord == std::cmp::Ordering::Equal {
@@ -390,19 +404,19 @@ impl Ord for ReadDesc {
    }
 }

-impl PartialOrd for ReadDesc {
+impl PartialOrd for LayerVisit {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
 }

-impl PartialEq for ReadDesc {
+impl PartialEq for LayerVisit {
    fn eq(&self, other: &Self) -> bool {
        self.lsn_range == other.lsn_range
    }
 }

-impl Eq for ReadDesc {}
+impl Eq for LayerVisit {}

 impl ReadableLayer {
    pub(crate) fn id(&self) -> LayerId {
@@ -434,10 +448,11 @@ impl ReadableLayer {
    }
 }

-/// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
-/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
-/// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
-/// be used for cache management but not for correctness-critical checks.
+/// Layers contain a hint indicating whether they are likely to be used for reads.
+///
+/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously
+/// when changing the visibility of layers (for example when creating a branch that makes some previously
+/// covered layers visible).  It should be used for cache management but not for correctness-critical checks.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum LayerVisibilityHint {
    /// A Visible layer might be read while serving a read, because there is not an image layer between it
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{
 use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
+    BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadCoalesceMode, VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
@@ -52,6 +52,7 @@ use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
+use pageserver_api::config::MaxVectoredReadBytes;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
@@ -135,10 +136,11 @@ impl Summary {
 // Flag indicating that this version initialize the page
 const WILL_INIT: u64 = 1;

-/// Struct representing reference to BLOB in layers. Reference contains BLOB
-/// offset, and for WAL records it also contains `will_init` flag. The flag
-/// helps to determine the range of records that needs to be applied, without
-/// reading/deserializing records themselves.
+/// Struct representing reference to BLOB in layers.
+///
+/// Reference contains BLOB offset, and for WAL records it also contains
+/// `will_init` flag. The flag helps to determine the range of records
+/// that needs to be applied, without reading/deserializing records themselves.
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
 pub struct BlobRef(pub u64);

@@ -1019,13 +1021,30 @@ impl DeltaLayerInner {
                    continue;
                }
            };
-
+            let view = BufView::new_slice(&blobs_buf.buf);
            for meta in blobs_buf.blobs.iter().rev() {
                if Some(meta.meta.key) == ignore_key_with_err {
                    continue;
                }
+                let blob_read = meta.read(&view).await;
+                let blob_read = match blob_read {
+                    Ok(buf) => buf,
+                    Err(e) => {
+                        reconstruct_state.on_key_error(
+                            meta.meta.key,
+                            PageReconstructError::Other(anyhow!(e).context(format!(
+                                "Failed to decompress blob from virtual file {}",
+                                self.file.path,
+                            ))),
+                        );
+
+                        ignore_key_with_err = Some(meta.meta.key);
+                        continue;
+                    }
+                };
+
+                let value = Value::des(&blob_read);

-                let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
                let value = match value {
                    Ok(v) => v,
                    Err(e) => {
@@ -1241,21 +1260,21 @@ impl DeltaLayerInner {
                buf.reserve(read.size());
                let res = reader.read_blobs(&read, buf, ctx).await?;

+                let view = BufView::new_slice(&res.buf);
+
                for blob in res.blobs {
                    let key = blob.meta.key;
                    let lsn = blob.meta.lsn;
-                    let data = &res.buf[blob.start..blob.end];
+
+                    let data = blob.read(&view).await?;

                    #[cfg(debug_assertions)]
-                    Value::des(data)
+                    Value::des(&data)
                        .with_context(|| {
                            format!(
-                                "blob failed to deserialize for {}@{}, {}..{}: {:?}",
-                                blob.meta.key,
-                                blob.meta.lsn,
-                                blob.start,
-                                blob.end,
-                                utils::Hex(data)
+                                "blob failed to deserialize for {}: {:?}",
+                                blob,
+                                utils::Hex(&data)
                            )
                        })
                        .unwrap();
@@ -1263,15 +1282,15 @@ impl DeltaLayerInner {
                    // is it an image or will_init walrecord?
                    // FIXME: this could be handled by threading the BlobRef to the
                    // VectoredReadBuilder
-                    let will_init = crate::repository::ValueBytes::will_init(data)
+                    let will_init = crate::repository::ValueBytes::will_init(&data)
                        .inspect_err(|_e| {
                            #[cfg(feature = "testing")]
-                            tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
+                            tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
                        })
                        .unwrap_or(false);

                    per_blob_copy.clear();
-                    per_blob_copy.extend_from_slice(data);
+                    per_blob_copy.extend_from_slice(&data);

                    let (tmp, res) = writer
                        .put_value_bytes(
@@ -1536,8 +1555,11 @@ impl<'a> DeltaLayerIterator<'a> {
            .read_blobs(&plan, buf, self.ctx)
            .await?;
        let frozen_buf = blobs_buf.buf.freeze();
+        let view = BufView::new_bytes(frozen_buf);
        for meta in blobs_buf.blobs.iter() {
-            let value = Value::des(&frozen_buf[meta.start..meta.end])?;
+            let blob_read = meta.read(&view).await?;
+            let value = Value::des(&blob_read)?;
+
            next_batch.push_back((meta.meta.key, meta.meta.lsn, value));
        }
        self.key_values_batch = next_batch;
@@ -1914,9 +1936,13 @@ pub(crate) mod test {
                let blobs_buf = vectored_blob_reader
                    .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx)
                    .await?;
+                let view = BufView::new_slice(&blobs_buf.buf);
                for meta in blobs_buf.blobs.iter() {
-                    let value = &blobs_buf.buf[meta.start..meta.end];
-                    assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
+                    let value = meta.read(&view).await?;
+                    assert_eq!(
+                        &value[..],
+                        &entries_meta.index[&(meta.meta.key, meta.meta.lsn)]
+                    );
                }

                buf = Some(blobs_buf.buf);
--- a/pageserver/src/tenant/storage_layer/filter_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs
@@ -0,0 +1,205 @@
+use std::ops::Range;
+
+use anyhow::bail;
+use pageserver_api::{
+    key::Key,
+    keyspace::{KeySpace, SparseKeySpace},
+};
+use utils::lsn::Lsn;
+
+use crate::repository::Value;
+
+use super::merge_iterator::MergeIterator;
+
+/// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
+///
+/// The iterator will skip any keys not included in the keyspace filter. In other words, the keyspace filter contains the keys
+/// to be retained.
+pub struct FilterIterator<'a> {
+    inner: MergeIterator<'a>,
+    retain_key_filters: Vec<Range<Key>>,
+    current_filter_idx: usize,
+}
+
+impl<'a> FilterIterator<'a> {
+    pub fn create(
+        inner: MergeIterator<'a>,
+        dense_keyspace: KeySpace,
+        sparse_keyspace: SparseKeySpace,
+    ) -> anyhow::Result<Self> {
+        let mut retain_key_filters = Vec::new();
+        retain_key_filters.extend(dense_keyspace.ranges);
+        retain_key_filters.extend(sparse_keyspace.0.ranges);
+        retain_key_filters.sort_by(|a, b| a.start.cmp(&b.start));
+        // Verify key filters are non-overlapping and sorted
+        for window in retain_key_filters.windows(2) {
+            if window[0].end > window[1].start {
+                bail!(
+                    "Key filters are overlapping: {:?} and {:?}",
+                    window[0],
+                    window[1]
+                );
+            }
+        }
+        Ok(Self {
+            inner,
+            retain_key_filters,
+            current_filter_idx: 0,
+        })
+    }
+
+    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
+        while let Some(item) = self.inner.next().await? {
+            while self.current_filter_idx < self.retain_key_filters.len()
+                && item.0 >= self.retain_key_filters[self.current_filter_idx].end
+            {
+                // [filter region]    [filter region]     [filter region]
+                //                                     ^ item
+                //                    ^ current filter
+                self.current_filter_idx += 1;
+                // [filter region]    [filter region]     [filter region]
+                //                                     ^ item
+                //                                        ^ current filter
+            }
+            if self.current_filter_idx >= self.retain_key_filters.len() {
+                // We already exhausted all filters, so we should return now
+                // [filter region] [filter region] [filter region]
+                //                                                    ^ item
+                //                                                 ^ current filter (nothing)
+                return Ok(None);
+            }
+            if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
+                // [filter region]    [filter region]     [filter region]
+                //                                              ^ item
+                //                                        ^ current filter
+                return Ok(Some(item));
+            }
+            // If the key is not contained in the key retaining filters, continue to the next item.
+            // [filter region]    [filter region]     [filter region]
+            //                                     ^ item
+            //                                        ^ current filter
+        }
+        Ok(None)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use itertools::Itertools;
+    use pageserver_api::key::Key;
+    use utils::lsn::Lsn;
+
+    use crate::{
+        tenant::{
+            harness::{TenantHarness, TIMELINE_ID},
+            storage_layer::delta_layer::test::produce_delta_layer,
+        },
+        DEFAULT_PG_VERSION,
+    };
+
+    async fn assert_filter_iter_equal(
+        filter_iter: &mut FilterIterator<'_>,
+        expect: &[(Key, Lsn, Value)],
+    ) {
+        let mut expect_iter = expect.iter();
+        loop {
+            let o1 = filter_iter.next().await.unwrap();
+            let o2 = expect_iter.next();
+            assert_eq!(o1.is_some(), o2.is_some());
+            if o1.is_none() && o2.is_none() {
+                break;
+            }
+            let (k1, l1, v1) = o1.unwrap();
+            let (k2, l2, v2) = o2.unwrap();
+            assert_eq!(&k1, k2);
+            assert_eq!(l1, *l2);
+            assert_eq!(&v1, v2);
+        }
+    }
+
+    #[tokio::test]
+    async fn filter_keyspace_iterator() {
+        use crate::repository::Value;
+        use bytes::Bytes;
+
+        let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
+            .await
+            .unwrap();
+        let (tenant, ctx) = harness.load().await;
+
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+        const N: usize = 100;
+        let test_deltas1 = (0..N)
+            .map(|idx| {
+                (
+                    get_key(idx as u32),
+                    Lsn(0x20 * ((idx as u64) % 10 + 1)),
+                    Value::Image(Bytes::from(format!("img{idx:05}"))),
+                )
+            })
+            .collect_vec();
+        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
+            .await
+            .unwrap();
+
+        let merge_iter = MergeIterator::create(
+            &[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
+            &[],
+            &ctx,
+        );
+
+        let mut filter_iter = FilterIterator::create(
+            merge_iter,
+            KeySpace {
+                ranges: vec![
+                    get_key(5)..get_key(10),
+                    get_key(20)..get_key(30),
+                    get_key(90)..get_key(110),
+                    get_key(1000)..get_key(2000),
+                ],
+            },
+            SparseKeySpace(KeySpace::default()),
+        )
+        .unwrap();
+        let mut result = Vec::new();
+        result.extend(test_deltas1[5..10].iter().cloned());
+        result.extend(test_deltas1[20..30].iter().cloned());
+        result.extend(test_deltas1[90..100].iter().cloned());
+        assert_filter_iter_equal(&mut filter_iter, &result).await;
+
+        let merge_iter = MergeIterator::create(
+            &[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
+            &[],
+            &ctx,
+        );
+
+        let mut filter_iter = FilterIterator::create(
+            merge_iter,
+            KeySpace {
+                ranges: vec![
+                    get_key(0)..get_key(10),
+                    get_key(20)..get_key(30),
+                    get_key(90)..get_key(95),
+                ],
+            },
+            SparseKeySpace(KeySpace::default()),
+        )
+        .unwrap();
+        let mut result = Vec::new();
+        result.extend(test_deltas1[0..10].iter().cloned());
+        result.extend(test_deltas1[20..30].iter().cloned());
+        result.extend(test_deltas1[90..95].iter().cloned());
+        assert_filter_iter_equal(&mut filter_iter, &result).await;
+    }
+}
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1,7 +1,9 @@
 //! An ImageLayer represents an image or a snapshot of a key-range at
-//! one particular LSN. It contains an image of all key-value pairs
-//! in its key-range. Any key that falls into the image layer's range
-//! but does not exist in the layer, does not exist.
+//! one particular LSN.
+//!
+//! It contains an image of all key-value pairs in its key-range. Any key
+//! that falls into the image layer's range but does not exist in the layer,
+//! does not exist.
 //!
 //! An image layer is stored in a file on disk. The file is stored in
 //! timelines/<timeline_id> directory.  Currently, there are no
@@ -34,10 +36,10 @@ use crate::tenant::disk_btree::{
 };
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
+    BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadPlanner,
 };
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
@@ -46,6 +48,7 @@ use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
+use pageserver_api::config::MaxVectoredReadBytes;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
@@ -56,7 +59,6 @@ use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::str::FromStr;
-use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
 use tracing::*;
@@ -68,9 +70,7 @@ use utils::{
 };

 use super::layer_name::ImageLayerName;
-use super::{
-    AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
-};
+use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};

 ///
 /// Header stored in the beginning of the file
@@ -548,15 +548,15 @@ impl ImageLayerInner {

            let buf = BytesMut::with_capacity(buf_size);
            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
-
            let frozen_buf = blobs_buf.buf.freeze();
+            let view = BufView::new_bytes(frozen_buf);

            for meta in blobs_buf.blobs.iter() {
-                let img_buf = frozen_buf.slice(meta.start..meta.end);
+                let img_buf = meta.read(&view).await?;

                key_count += 1;
                writer
-                    .put_image(meta.meta.key, img_buf, ctx)
+                    .put_image(meta.meta.key, img_buf.into_bytes(), ctx)
                    .await
                    .context(format!("Storing key {}", meta.meta.key))?;
            }
@@ -603,13 +603,28 @@ impl ImageLayerInner {
            match res {
                Ok(blobs_buf) => {
                    let frozen_buf = blobs_buf.buf.freeze();
-
+                    let view = BufView::new_bytes(frozen_buf);
                    for meta in blobs_buf.blobs.iter() {
-                        let img_buf = frozen_buf.slice(meta.start..meta.end);
+                        let img_buf = meta.read(&view).await;
+
+                        let img_buf = match img_buf {
+                            Ok(img_buf) => img_buf,
+                            Err(e) => {
+                                reconstruct_state.on_key_error(
+                                    meta.meta.key,
+                                    PageReconstructError::Other(anyhow!(e).context(format!(
+                                        "Failed to decompress blob from virtual file {}",
+                                        self.file.path,
+                                    ))),
+                                );
+
+                                continue;
+                            }
+                        };
                        reconstruct_state.update_key(
                            &meta.meta.key,
                            self.lsn,
-                            Value::Image(img_buf),
+                            Value::Image(img_buf.into_bytes()),
                        );
                    }
                }
@@ -798,10 +813,9 @@ impl ImageLayerWriterInner {
    ///
    async fn finish(
        self,
-        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        end_key: Option<Key>,
-    ) -> anyhow::Result<ResidentLayer> {
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -877,12 +891,9 @@ impl ImageLayerWriterInner {
        // fsync the file
        file.sync_all().await?;

-        // FIXME: why not carry the virtualfile here, it supports renaming?
-        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+        trace!("created image layer {}", self.path);

-        info!("created image layer {}", layer.local_path());
-
-        Ok(layer)
+        Ok((desc, self.path))
    }
 }

@@ -961,24 +972,18 @@ impl ImageLayerWriter {
    ///
    pub(crate) async fn finish(
        mut self,
-        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner.take().unwrap().finish(timeline, ctx, None).await
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        self.inner.take().unwrap().finish(ctx, None).await
    }

    /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
    pub(super) async fn finish_with_end_key(
        mut self,
-        timeline: &Arc<Timeline>,
        end_key: Key,
        ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(timeline, ctx, Some(end_key))
-            .await
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        self.inner.take().unwrap().finish(ctx, Some(end_key)).await
    }
 }

@@ -1036,10 +1041,15 @@ impl<'a> ImageLayerIterator<'a> {
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
-        let frozen_buf: Bytes = blobs_buf.buf.freeze();
+        let frozen_buf = blobs_buf.buf.freeze();
+        let view = BufView::new_bytes(frozen_buf);
        for meta in blobs_buf.blobs.iter() {
-            let img_buf = frozen_buf.slice(meta.start..meta.end);
-            next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf)));
+            let img_buf = meta.read(&view).await?;
+            next_batch.push_back((
+                meta.meta.key,
+                self.image_layer.lsn,
+                Value::Image(img_buf.into_bytes()),
+            ));
        }
        self.key_values_batch = next_batch;
        Ok(())
@@ -1082,7 +1092,7 @@ mod test {
        tenant::{
            config::TenantConf,
            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::ResidentLayer,
+            storage_layer::{Layer, ResidentLayer},
            vectored_blob_io::StreamingVectoredReadPlanner,
            Tenant, Timeline,
        },
@@ -1153,7 +1163,8 @@ mod test {

                key = key.next();
            }
-            writer.finish(&timeline, &ctx).await.unwrap()
+            let (desc, path) = writer.finish(&ctx).await.unwrap();
+            Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap()
        };
        let original_size = resident.metadata().file_size;

@@ -1215,7 +1226,9 @@ mod test {
                .await
                .unwrap();
            let replacement = if wrote_keys > 0 {
-                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
+                let (desc, path) = filtered_writer.finish(&ctx).await.unwrap();
+                let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap();
+                Some(resident)
            } else {
                None
            };
@@ -1288,7 +1301,8 @@ mod test {
        for (key, img) in images {
            writer.put_image(key, img, ctx).await?;
        }
-        let img_layer = writer.finish(tline, ctx).await?;
+        let (desc, path) = writer.finish(ctx).await?;
+        let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;

        Ok::<_, anyhow::Error>(img_layer)
    }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -215,7 +215,7 @@ impl IndexEntry {

    const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
        let res = Self::validate_checkpoint_distance(
-            crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE,
+            pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
        );
        if res.is_err() {
            panic!("default checkpoint distance is valid")
@@ -692,8 +692,13 @@ impl InMemoryLayer {
            let vec_map = inner.index.entry(key).or_default();
            let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
            if old.is_some() {
-                // We already had an entry for this LSN. That's odd..
-                warn!("Key {} at {} already exists", key, lsn);
+                // This should not break anything, but is unexpected: ingestion code aims to filter out
+                // multiple writes to the same key at the same LSN.  This happens in cases where our
+                // ingenstion code generates some write like an empty page, and we see a write from postgres
+                // to the same key in the same wal record.  If one such write makes it through, we
+                // index the most recent write, implicitly ignoring the earlier write.  We log a warning
+                // because this case is unexpected, and we would like tests to fail if this happens.
+                warn!("Key {} at {} written twice at same LSN", key, lsn);
            }
        }

--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -439,11 +439,30 @@ impl Layer {

    fn record_access(&self, ctx: &RequestContext) {
        if self.0.access_stats.record_access(ctx) {
-            // Visibility was modified to Visible
-            tracing::info!(
-                "Layer {} became visible as a result of access",
-                self.0.desc.key()
-            );
+            // Visibility was modified to Visible: maybe log about this
+            match ctx.task_kind() {
+                TaskKind::CalculateSyntheticSize
+                | TaskKind::GarbageCollector
+                | TaskKind::MgmtRequest => {
+                    // This situation is expected in code paths do binary searches of the LSN space to resolve
+                    // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
+                    // and on-demand for certain HTTP API requests.
+                }
+                _ => {
+                    // In all other contexts, it is unusual to do I/O involving layers which are not visible at
+                    // some branch tip, so we log the fact that we are accessing something that the visibility
+                    // calculation thought should not be visible.
+                    //
+                    // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
+                    // which was covered by a concurrent compaction.
+                    tracing::info!(
+                        "Layer {} became visible as a result of access",
+                        self.0.desc.key()
+                    );
+                }
+            }
+
+            // Update the timeline's visible bytes count
            if let Some(tl) = self.0.timeline.upgrade() {
                tl.metrics
                    .visible_physical_size_gauge
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1025,6 +1025,15 @@ fn access_stats() {
    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
    access_stats.set_visibility(LayerVisibilityHint::Visible);
    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
+
+    // Recording access implicitly makes layer visible, if it wasn't already
+    let atime = UNIX_EPOCH + Duration::from_secs(2200000000);
+    access_stats.set_visibility(LayerVisibilityHint::Covered);
+    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
+    assert!(access_stats.record_access_at(atime));
+    access_stats.set_visibility(LayerVisibilityHint::Visible);
+    assert!(!access_stats.record_access_at(atime));
+    access_stats.set_visibility(LayerVisibilityHint::Visible);
 }

 #[test]
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -12,8 +12,10 @@ use serde::{Deserialize, Serialize};
 #[cfg(test)]
 use utils::id::TenantId;

-/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
-/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
+/// A unique identifier of a persistent layer.
+///
+/// This is different from `LayerDescriptor`, which is only used in the benchmarks.
+/// This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
 pub struct PersistentLayerDesc {
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -217,8 +217,9 @@ impl fmt::Display for ImageLayerName {
    }
 }

-/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.  The
-/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
+/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.
+///
+/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
 /// over time (e.g. across shard splits or compression). The physical filenames of layers in local
 /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
 /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -226,9 +226,11 @@ impl<'a> IteratorWrapper<'a> {
    }
 }

-/// A merge iterator over delta/image layer iterators. When duplicated records are
-/// found, the iterator will not perform any deduplication, and the caller should handle
-/// these situation. By saying duplicated records, there are many possibilities:
+/// A merge iterator over delta/image layer iterators.
+///
+/// When duplicated records are found, the iterator will not perform any
+/// deduplication, and the caller should handle these situation. By saying
+/// duplicated records, there are many possibilities:
 ///
 /// * Two same delta at the same LSN.
 /// * Two same image at the same LSN.
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -34,9 +34,10 @@ impl SplitWriterResult {
    }
 }

-/// An image writer that takes images and produces multiple image layers. The interface does not
-/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
-/// to be cleaned up)
+/// An image writer that takes images and produces multiple image layers.
+///
+/// The interface does not guarantee atomicity (i.e., if the image layer generation
+/// fails, there might be leftover files to be cleaned up)
 #[must_use]
 pub struct SplitImageLayerWriter {
    inner: ImageLayerWriter,
@@ -120,11 +121,11 @@ impl SplitImageLayerWriter {
                self.generated_layers
                    .push(SplitWriterResult::Discarded(layer_key));
            } else {
-                self.generated_layers.push(SplitWriterResult::Produced(
-                    prev_image_writer
-                        .finish_with_end_key(tline, key, ctx)
-                        .await?,
-                ));
+                let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
+
+                let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+                self.generated_layers
+                    .push(SplitWriterResult::Produced(layer));
            }
        }
        self.inner.put_image(key, img, ctx).await
@@ -169,9 +170,9 @@ impl SplitImageLayerWriter {
        if discard(&layer_key).await {
            generated_layers.push(SplitWriterResult::Discarded(layer_key));
        } else {
-            generated_layers.push(SplitWriterResult::Produced(
-                inner.finish_with_end_key(tline, end_key, ctx).await?,
-            ));
+            let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
+            let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+            generated_layers.push(SplitWriterResult::Produced(layer));
        }
        Ok(generated_layers)
    }
@@ -187,22 +188,23 @@ impl SplitImageLayerWriter {
            .await
    }

-    /// When split writer fails, the caller should call this function and handle partially generated layers.
+    /// This function will be deprecated with #8841.
    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
        Ok((self.generated_layers, self.inner))
    }
 }

-/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
-/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
-/// to be cleaned up).
+/// A delta writer that takes key-lsn-values and produces multiple delta layers.
+///
+/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
+/// there might be leftover files to be cleaned up).
 ///
 /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
 /// will split them into multiple files based on size.
 #[must_use]
 pub struct SplitDeltaLayerWriter {
-    inner: DeltaLayerWriter,
+    inner: Option<(Key, DeltaLayerWriter)>,
    target_layer_size: u64,
    generated_layers: Vec<SplitWriterResult>,
    conf: &'static PageServerConf,
@@ -210,7 +212,6 @@ pub struct SplitDeltaLayerWriter {
    tenant_shard_id: TenantShardId,
    lsn_range: Range<Lsn>,
    last_key_written: Key,
-    start_key: Key,
 }

 impl SplitDeltaLayerWriter {
@@ -218,29 +219,18 @@ impl SplitDeltaLayerWriter {
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
-        start_key: Key,
        lsn_range: Range<Lsn>,
        target_layer_size: u64,
-        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        Ok(Self {
            target_layer_size,
-            inner: DeltaLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                start_key,
-                lsn_range.clone(),
-                ctx,
-            )
-            .await?,
+            inner: None,
            generated_layers: Vec::new(),
            conf,
            timeline_id,
            tenant_shard_id,
            lsn_range,
            last_key_written: Key::MIN,
-            start_key,
        })
    }

@@ -263,9 +253,26 @@ impl SplitDeltaLayerWriter {
        //
        // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
        // strategy. https://github.com/neondatabase/neon/issues/8837
+
+        if self.inner.is_none() {
+            self.inner = Some((
+                key,
+                DeltaLayerWriter::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_shard_id,
+                    key,
+                    self.lsn_range.clone(),
+                    ctx,
+                )
+                .await?,
+            ));
+        }
+        let (_, inner) = self.inner.as_mut().unwrap();
+
        let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
+        if inner.num_keys() >= 1
+            && inner.estimated_size() + addition_size_estimation >= self.target_layer_size
        {
            if key != self.last_key_written {
                let next_delta_writer = DeltaLayerWriter::new(
@@ -277,13 +284,13 @@ impl SplitDeltaLayerWriter {
                    ctx,
                )
                .await?;
-                let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
+                let (start_key, prev_delta_writer) =
+                    std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
                let layer_key = PersistentLayerKey {
-                    key_range: self.start_key..key,
+                    key_range: start_key..key,
                    lsn_range: self.lsn_range.clone(),
                    is_delta: true,
                };
-                self.start_key = key;
                if discard(&layer_key).await {
                    drop(prev_delta_writer);
                    self.generated_layers
@@ -294,17 +301,18 @@ impl SplitDeltaLayerWriter {
                    self.generated_layers
                        .push(SplitWriterResult::Produced(delta_layer));
                }
-            } else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
+            } else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
                // We have to produce a very large file b/c a key is updated too often.
                anyhow::bail!(
                    "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
                    key,
-                    self.inner.estimated_size()
+                    inner.estimated_size()
                );
            }
        }
        self.last_key_written = key;
-        self.inner.put_value(key, lsn, val, ctx).await
+        let (_, inner) = self.inner.as_mut().unwrap();
+        inner.put_value(key, lsn, val, ctx).await
    }

    pub async fn put_value(
@@ -323,7 +331,6 @@ impl SplitDeltaLayerWriter {
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
-        end_key: Key,
        discard: D,
    ) -> anyhow::Result<Vec<SplitWriterResult>>
    where
@@ -335,11 +342,15 @@ impl SplitDeltaLayerWriter {
            inner,
            ..
        } = self;
+        let Some((start_key, inner)) = inner else {
+            return Ok(generated_layers);
+        };
        if inner.num_keys() == 0 {
            return Ok(generated_layers);
        }
+        let end_key = self.last_key_written.next();
        let layer_key = PersistentLayerKey {
-            key_range: self.start_key..end_key,
+            key_range: start_key..end_key,
            lsn_range: self.lsn_range.clone(),
            is_delta: true,
        };
@@ -358,15 +369,14 @@ impl SplitDeltaLayerWriter {
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
-        end_key: Key,
    ) -> anyhow::Result<Vec<SplitWriterResult>> {
-        self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
+        self.finish_with_discard_fn(tline, ctx, |_| async { false })
            .await
    }

-    /// When split writer fails, the caller should call this function and handle partially generated layers.
-    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
-        Ok((self.generated_layers, self.inner))
+    /// This function will be deprecated with #8841.
+    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
+        Ok((self.generated_layers, self.inner.map(|x| x.1)))
    }
 }

@@ -430,10 +440,8 @@ mod tests {
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
-            get_key(0),
            Lsn(0x18)..Lsn(0x20),
            4 * 1024 * 1024,
-            &ctx,
        )
        .await
        .unwrap();
@@ -458,11 +466,22 @@ mod tests {
            )
            .await
            .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
+        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
        assert_eq!(layers.len(), 1);
+        assert_eq!(
+            layers
+                .into_iter()
+                .next()
+                .unwrap()
+                .into_resident_layer()
+                .layer_desc()
+                .key(),
+            PersistentLayerKey {
+                key_range: get_key(0)..get_key(1),
+                lsn_range: Lsn(0x18)..Lsn(0x20),
+                is_delta: true
+            }
+        );
    }

    #[tokio::test]
@@ -499,10 +518,8 @@ mod tests {
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
-            get_key(0),
            Lsn(0x18)..Lsn(0x20),
            4 * 1024 * 1024,
-            &ctx,
        )
        .await
        .unwrap();
@@ -531,10 +548,7 @@ mod tests {
            .finish(&tline, &ctx, get_key(N as u32))
            .await
            .unwrap();
-        let delta_layers = delta_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
+        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
        if discard {
            for layer in image_layers {
                layer.into_discarded_layer();
@@ -553,6 +567,14 @@ mod tests {
                .collect_vec();
            assert_eq!(image_layers.len(), N / 512 + 1);
            assert_eq!(delta_layers.len(), N / 512 + 1);
+            assert_eq!(
+                delta_layers.first().unwrap().layer_desc().key_range.start,
+                get_key(0)
+            );
+            assert_eq!(
+                delta_layers.last().unwrap().layer_desc().key_range.end,
+                get_key(N as u32)
+            );
            for idx in 0..image_layers.len() {
                assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
                assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
@@ -600,10 +622,8 @@ mod tests {
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
-            get_key(0),
            Lsn(0x18)..Lsn(0x20),
            4 * 1024,
-            &ctx,
        )
        .await
        .unwrap();
@@ -642,11 +662,35 @@ mod tests {
            )
            .await
            .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
+        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
        assert_eq!(layers.len(), 2);
+        let mut layers_iter = layers.into_iter();
+        assert_eq!(
+            layers_iter
+                .next()
+                .unwrap()
+                .into_resident_layer()
+                .layer_desc()
+                .key(),
+            PersistentLayerKey {
+                key_range: get_key(0)..get_key(1),
+                lsn_range: Lsn(0x18)..Lsn(0x20),
+                is_delta: true
+            }
+        );
+        assert_eq!(
+            layers_iter
+                .next()
+                .unwrap()
+                .into_resident_layer()
+                .layer_desc()
+                .key(),
+            PersistentLayerKey {
+                key_range: get_key(1)..get_key(2),
+                lsn_range: Lsn(0x18)..Lsn(0x20),
+                is_delta: true
+            }
+        );
    }

    #[tokio::test]
@@ -666,10 +710,8 @@ mod tests {
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
-            get_key(0),
            Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
            4 * 1024 * 1024,
-            &ctx,
        )
        .await
        .unwrap();
@@ -687,10 +729,20 @@ mod tests {
                .await
                .unwrap();
        }
-        let delta_layers = delta_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
+        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
        assert_eq!(delta_layers.len(), 1);
+        let delta_layer = delta_layers
+            .into_iter()
+            .next()
+            .unwrap()
+            .into_resident_layer();
+        assert_eq!(
+            delta_layer.layer_desc().key(),
+            PersistentLayerKey {
+                key_range: get_key(0)..get_key(1),
+                lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
+                is_delta: true
+            }
+        );
    }
 }
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -10,7 +10,6 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD;
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
@@ -164,8 +163,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    // How many errors we have seen consequtively
    let mut error_run_count = 0;

-    let mut last_throttle_flag_reset_at = Instant::now();
-
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -192,8 +189,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            }

-
-
            let sleep_duration;
            if period == Duration::ZERO {
                #[cfg(not(feature = "testing"))]
@@ -208,12 +203,18 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                };

                // Run compaction
-                let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
+                let IterationResult { output, elapsed } = iteration
+                    .run(tenant.compaction_iteration(&cancel, &ctx))
+                    .await;
                match output {
                    Ok(has_pending_task) => {
                        error_run_count = 0;
                        // schedule the next compaction immediately in case there is a pending compaction task
-                        sleep_duration = if has_pending_task { Duration::ZERO } else { period };
+                        sleep_duration = if has_pending_task {
+                            Duration::ZERO
+                        } else {
+                            period
+                        };
                    }
                    Err(e) => {
                        let wait_duration = backoff::exponential_backoff_duration_seconds(
@@ -234,38 +235,20 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }

                // the duration is recorded by performance tests by enabling debug in this function
-                tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
+                tracing::debug!(
+                    elapsed_ms = elapsed.as_millis(),
+                    "compaction iteration complete"
+                );
            };

-
            // Perhaps we did no work and the walredo process has been idle for some time:
            // give it a chance to shut down to avoid leaving walredo process running indefinitely.
+            // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
+            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
            if let Some(walredo_mgr) = &tenant.walredo_mgr {
                walredo_mgr.maybe_quiesce(period * 10);
            }

-            // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
-            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
-            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
-                let now = Instant::now();
-                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
-                let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
-                if count_throttled == 0 {
-                    return;
-                }
-                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
-                let delta = now - prev;
-                info!(
-                    n_seconds=%format_args!("{:.3}",
-                    delta.as_secs_f64()),
-                    count_accounted,
-                    count_throttled,
-                    sum_throttled_usecs,
-                    allowed_rps=%format_args!("{allowed_rps:.0}"),
-                    "shard was throttled in the last n_seconds"
-                );
-            });
-
            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
                .await
@@ -347,6 +330,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);

        let mut first = true;
+        tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
        loop {
            tokio::select! {
                _ = cancel.cancelled() => {
@@ -364,7 +348,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                first = false;

                let delays = async {
-                    delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
                    random_init_delay(period, &cancel).await?;
                    Ok::<_, Cancelled>(())
                };
@@ -438,6 +421,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
+    let mut last_throttle_flag_reset_at = Instant::now();
        loop {
            tokio::select! {
                _ = cancel.cancelled() => {
@@ -456,9 +440,11 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken

            // If compaction period is set to zero (to disable it), then we will use a reasonable default
            let period = if period == Duration::ZERO {
-                humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD)
-                    .unwrap()
-                    .into()
+                humantime::Duration::from_str(
+                    pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD,
+                )
+                .unwrap()
+                .into()
            } else {
                period
            };
@@ -482,6 +468,28 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
                kind: BackgroundLoopKind::IngestHouseKeeping,
            };
            iteration.run(tenant.ingest_housekeeping()).await;
+
+            // TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
+            // Or just spawn another background loop for this throttle, it's not like it's super costly.
+            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
+                let now = Instant::now();
+                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
+                let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
+                if count_throttled == 0 {
+                    return;
+                }
+                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
+                let delta = now - prev;
+                info!(
+                    n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
+                    count_accounted = count_accounted_finish,  // don't break existing log scraping
+                    count_throttled,
+                    sum_throttled_usecs,
+                    count_accounted_start, // log after pre-existing fields to not break existing log scraping
+                    allowed_rps=%format_args!("{allowed_rps:.0}"),
+                    "shard was throttled in the last n_seconds"
+                );
+            });
        }
    }
    .await;
@@ -537,28 +545,12 @@ pub(crate) async fn random_init_delay(
        let mut rng = rand::thread_rng();
        rng.gen_range(Duration::ZERO..=period)
    };
-
    match tokio::time::timeout(d, cancel.cancelled()).await {
        Ok(_) => Err(Cancelled),
        Err(_) => Ok(()),
    }
 }

-/// Delays GC by defaul lease length at restart.
-///
-/// We do this as the leases mapping are not persisted to disk. By delaying GC by default
-/// length, we gurantees that all the leases we granted before the restart will expire
-/// when we run GC for the first time after the restart.
-pub(crate) async fn delay_by_lease_length(
-    length: Duration,
-    cancel: &CancellationToken,
-) -> Result<(), Cancelled> {
-    match tokio::time::timeout(length, cancel.cancelled()).await {
-        Ok(_) => Err(Cancelled),
-        Err(_) => Ok(()),
-    }
-}
-
 struct Iteration {
    started_at: Instant,
    period: Duration,
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -24,8 +24,10 @@ use crate::{context::RequestContext, task_mgr::TaskKind};
 pub struct Throttle<M: Metric> {
    inner: ArcSwap<Inner>,
    metric: M,
-    /// will be turned into [`Stats::count_accounted`]
-    count_accounted: AtomicU64,
+    /// will be turned into [`Stats::count_accounted_start`]
+    count_accounted_start: AtomicU64,
+    /// will be turned into [`Stats::count_accounted_finish`]
+    count_accounted_finish: AtomicU64,
    /// will be turned into [`Stats::count_throttled`]
    count_throttled: AtomicU64,
    /// will be turned into [`Stats::sum_throttled_usecs`]
@@ -43,17 +45,21 @@ pub struct Observation {
    pub wait_time: Duration,
 }
 pub trait Metric {
+    fn accounting_start(&self);
+    fn accounting_finish(&self);
    fn observe_throttling(&self, observation: &Observation);
 }

 /// See [`Throttle::reset_stats`].
 pub struct Stats {
-    // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
-    pub count_accounted: u64,
-    // Subset of the `accounted` requests that were actually throttled.
-    // Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
+    /// Number of requests that started [`Throttle::throttle`] calls.
+    pub count_accounted_start: u64,
+    /// Number of requests that finished [`Throttle::throttle`] calls.
+    pub count_accounted_finish: u64,
+    /// Subset of the `accounted` requests that were actually throttled.
+    /// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
    pub count_throttled: u64,
-    // Sum of microseconds that throttled requests spent waiting for throttling.
+    /// Sum of microseconds that throttled requests spent waiting for throttling.
    pub sum_throttled_usecs: u64,
 }

@@ -65,7 +71,8 @@ where
        Self {
            inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
            metric,
-            count_accounted: AtomicU64::new(0),
+            count_accounted_start: AtomicU64::new(0),
+            count_accounted_finish: AtomicU64::new(0),
            count_throttled: AtomicU64::new(0),
            sum_throttled_usecs: AtomicU64::new(0),
        }
@@ -117,11 +124,13 @@ where
    /// This method allows retrieving & resetting that flag.
    /// Useful for periodic reporting.
    pub fn reset_stats(&self) -> Stats {
-        let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
+        let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed);
+        let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed);
        let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
        let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
        Stats {
-            count_accounted,
+            count_accounted_start,
+            count_accounted_finish,
            count_throttled,
            sum_throttled_usecs,
        }
@@ -139,9 +148,12 @@ where
        };
        let start = std::time::Instant::now();

+        self.metric.accounting_start();
+        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
        let did_throttle = inner.rate_limiter.acquire(key_count).await;
+        self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
+        self.metric.accounting_finish();

-        self.count_accounted.fetch_add(1, Ordering::Relaxed);
        if did_throttle {
            self.count_throttled.fetch_add(1, Ordering::Relaxed);
            let now = Instant::now();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -66,7 +66,6 @@ use std::{
 use crate::{
    aux_file::AuxFileSizeEstimator,
    tenant::{
-        config::defaults::DEFAULT_PITR_INTERVAL,
        layer_map::{LayerMap, SearchResult},
        metadata::TimelineMetadata,
        storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
@@ -102,6 +101,7 @@ use crate::{
    pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
    virtual_file::{MaybeFatalIo, VirtualFile},
 };
+use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
@@ -196,9 +196,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
    pub remote_client: RemoteTimelineClient,
-    pub timeline_get_throttle: Arc<
-        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
-    >,
+    pub timeline_get_throttle:
+        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }

@@ -406,9 +405,8 @@ pub struct Timeline {
    gc_lock: tokio::sync::Mutex<()>,

    /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
-    timeline_get_throttle: Arc<
-        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
-    >,
+    timeline_get_throttle:
+        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,

    /// Keep aux directory cache to avoid it's reconstruction on each update
    pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
@@ -2243,7 +2241,7 @@ impl Timeline {
            };

            if aux_file_policy == Some(AuxFilePolicy::V1) {
-                warn!("this timeline is using deprecated aux file policy V1");
+                warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)");
            }

            result.repartition_threshold =
@@ -4013,7 +4011,9 @@ impl Timeline {
        if wrote_keys {
            // Normal path: we have written some data into the new image layer for this
            // partition, so flush it to disk.
-            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            let (desc, path) = image_layer_writer.finish(ctx).await?;
+            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+            info!("created image layer for rel {}", image_layer.local_path());
            Ok(ImageLayerCreationOutcome {
                image: Some(image_layer),
                next_start_key: img_range.end,
@@ -4101,7 +4101,12 @@ impl Timeline {
        if wrote_any_image {
            // Normal path: we have written some data into the new image layer for this
            // partition, so flush it to disk.
-            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            let (desc, path) = image_layer_writer.finish(ctx).await?;
+            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+            info!(
+                "created image layer for metadata {}",
+                image_layer.local_path()
+            );
            Ok(ImageLayerCreationOutcome {
                image: Some(image_layer),
                next_start_key: img_range.end,
@@ -4309,7 +4314,9 @@ impl Timeline {
        timer.stop_and_record();

        // Creating image layers may have caused some previously visible layers to be covered
-        self.update_layer_visibility().await?;
+        if !image_layers.is_empty() {
+            self.update_layer_visibility().await?;
+        }

        Ok(image_layers)
    }
@@ -5371,7 +5378,8 @@ impl Timeline {
    /// Force create an image layer and place it into the layer map.
    ///
    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
+    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
+    /// placed into the layer map in one run AND be validated.
    #[cfg(test)]
    pub(super) async fn force_create_image_layer(
        self: &Arc<Timeline>,
@@ -5403,8 +5411,9 @@ impl Timeline {
        for (key, img) in images {
            image_layer_writer.put_image(key, img, ctx).await?;
        }
-        let image_layer = image_layer_writer.finish(self, ctx).await?;
-
+        let (desc, path) = image_layer_writer.finish(ctx).await?;
+        let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+        info!("force created image layer {}", image_layer.local_path());
        {
            let mut guard = self.layers.write().await;
            guard.open_mut().unwrap().force_insert_layer(image_layer);
@@ -5416,7 +5425,8 @@ impl Timeline {
    /// Force create a delta layer and place it into the layer map.
    ///
    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
+    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
+    /// placed into the layer map in one run AND be validated.
    #[cfg(test)]
    pub(super) async fn force_create_delta_layer(
        self: &Arc<Timeline>,
@@ -5442,33 +5452,6 @@ impl Timeline {
        if let Some(check_start_lsn) = check_start_lsn {
            assert!(deltas.lsn_range.start >= check_start_lsn);
        }
-        // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of
-        // layers of the same start/end LSN, and so should the force inserted layer
-        {
-            /// Checks if a overlaps with b, assume a/b = [start, end).
-            pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
-                !(a.end <= b.start || b.end <= a.start)
-            }
-
-            if deltas.key_range.start.next() != deltas.key_range.end {
-                let guard = self.layers.read().await;
-                let mut invalid_layers =
-                    guard.layer_map()?.iter_historic_layers().filter(|layer| {
-                        layer.is_delta()
-                        && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
-                        && layer.lsn_range != deltas.lsn_range
-                        // skip single-key layer files
-                        && layer.key_range.start.next() != layer.key_range.end
-                    });
-                if let Some(layer) = invalid_layers.next() {
-                    // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
-                    panic!(
-                        "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
-                        deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
-                    );
-                }
-            }
-        }
        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
@@ -5483,7 +5466,7 @@ impl Timeline {
        }
        let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
        let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-
+        info!("force created delta layer {}", delta_layer.local_path());
        {
            let mut guard = self.layers.write().await;
            guard.open_mut().unwrap().force_insert_layer(delta_layer);
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -29,8 +29,9 @@ use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
+use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
+use crate::tenant::storage_layer::filter_iterator::FilterIterator;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::split_writer::{
    SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
@@ -43,6 +44,9 @@ use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
+use pageserver_api::config::tenant_conf_defaults::{
+    DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
+};

 use crate::keyspace::KeySpace;
 use crate::repository::{Key, Value};
@@ -561,10 +565,12 @@ impl Timeline {
                .await?;

            if keys_written > 0 {
-                let new_layer = image_layer_writer
-                    .finish(self, ctx)
+                let (desc, path) = image_layer_writer
+                    .finish(ctx)
                    .await
                    .map_err(CompactionError::Other)?;
+                let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
+                    .map_err(CompactionError::Other)?;
                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
                    layer.metadata().file_size,
                    new_layer.metadata().file_size);
@@ -909,137 +915,13 @@ impl Timeline {
        // we're compacting, in key, LSN order.
        // If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
        // then the Value::Image is ordered before Value::WalRecord.
-        //
-        // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io
-        // option and validation code once we've reached confidence.
-        enum AllValuesIter<'a> {
-            PageCachedBlobIo {
-                all_keys_iter: VecIter<'a>,
-            },
-            StreamingKmergeBypassingPageCache {
-                merge_iter: MergeIterator<'a>,
-            },
-            ValidatingStreamingKmergeBypassingPageCache {
-                mode: CompactL0BypassPageCacheValidation,
-                merge_iter: MergeIterator<'a>,
-                all_keys_iter: VecIter<'a>,
-            },
-        }
-        type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes
-        impl AllValuesIter<'_> {
-            async fn next_all_keys_iter(
-                iter: &mut VecIter<'_>,
-                ctx: &RequestContext,
-            ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-                let Some(DeltaEntry {
-                    key,
-                    lsn,
-                    val: value_ref,
-                    ..
-                }) = iter.next()
-                else {
-                    return Ok(None);
-                };
-                let value = value_ref.load(ctx).await?;
-                Ok(Some((*key, *lsn, value)))
-            }
-            async fn next(
-                &mut self,
-                ctx: &RequestContext,
-            ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-                match self {
-                    AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => {
-                      Self::next_all_keys_iter(iter, ctx).await
-                    }
-                    AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await,
-                    AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async {
-                        // advance both iterators
-                        let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await;
-                        let merge_iter_item = merge_iter.next().await;
-                        // compare results & log warnings as needed
-                        macro_rules! rate_limited_warn {
-                            ($($arg:tt)*) => {{
-                                if cfg!(debug_assertions) || cfg!(feature = "testing") {
-                                    warn!($($arg)*);
-                                    panic!("CompactL0BypassPageCacheValidation failure, check logs");
-                                }
-                                use once_cell::sync::Lazy;
-                                use utils::rate_limit::RateLimit;
-                                use std::sync::Mutex;
-                                use std::time::Duration;
-                                static LOGGED: Lazy<Mutex<RateLimit>> =
-                                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-                                let mut rate_limit = LOGGED.lock().unwrap();
-                                rate_limit.call(|| {
-                                    warn!($($arg)*);
-                                });
-                            }}
-                        }
-                        match (&all_keys_iter_item, &merge_iter_item) {
-                            (Err(_), Err(_)) => {
-                                // don't bother asserting equivality of the errors
-                            }
-                            (Err(all_keys), Ok(merge)) => {
-                                rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}");
-                            },
-                            (Ok(all_keys), Err(merge)) => {
-                                rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}");
-                            },
-                            (Ok(None), Ok(None)) => { }
-                            (Ok(Some(all_keys)), Ok(None)) => {
-                                rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some");
-                            }
-                            (Ok(None), Ok(Some(merge))) => {
-                                rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some");
-                            }
-                            (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => {
-                                match mode {
-                                    // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one
-                                    CompactL0BypassPageCacheValidation::KeyLsn => {
-                                        let all_keys = (all_keys_key, all_keys_lsn);
-                                        let merge = (merge_key, merge_lsn);
-                                        if all_keys != merge {
-                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter");
-                                        }
-                                    }
-                                    CompactL0BypassPageCacheValidation::KeyLsnValue => {
-                                        let all_keys = (all_keys_key, all_keys_lsn, all_keys_value);
-                                        let merge = (merge_key, merge_lsn, merge_value);
-                                        if all_keys != merge {
-                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter");
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                        // in case of mismatch, trust the legacy all_keys_iter_item
-                        all_keys_iter_item
-                    }.instrument(info_span!("next")).await
-                }
-            }
-        }
-        let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access {
-            CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo {
-                all_keys_iter: all_keys.iter(),
-            },
-            CompactL0Phase1ValueAccess::StreamingKmerge { validate } => {
-                let merge_iter = {
-                    let mut deltas = Vec::with_capacity(deltas_to_compact.len());
-                    for l in deltas_to_compact.iter() {
-                        let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
-                        deltas.push(l);
-                    }
-                    MergeIterator::create(&deltas, &[], ctx)
-                };
-                match validate {
-                    None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter },
-                    Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache {
-                        mode: validate.clone(),
-                        merge_iter,
-                        all_keys_iter: all_keys.iter(),
-                    },
-                }
+        let mut all_values_iter = {
+            let mut deltas = Vec::with_capacity(deltas_to_compact.len());
+            for l in deltas_to_compact.iter() {
+                let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
+                deltas.push(l);
            }
+            MergeIterator::create(&deltas, &[], ctx)
        };

        // This iterator walks through all keys and is needed to calculate size used by each key
@@ -1116,7 +998,7 @@ impl Timeline {
        let mut keys = 0;

        while let Some((key, lsn, value)) = all_values_iter
-            .next(ctx)
+            .next()
            .await
            .map_err(CompactionError::Other)?
        {
@@ -1433,43 +1315,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
    }
 }

-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-pub enum CompactL0Phase1ValueAccess {
-    /// The old way.
-    PageCachedBlobIo,
-    /// The new way.
-    StreamingKmerge {
-        /// If set, we run both the old way and the new way, validate that
-        /// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
-        /// and if the validation fails,
-        /// - in tests: fail them with a panic or
-        /// - in prod, log a rate-limited warning and use the old way's results.
-        ///
-        /// If not set, we only run the new way and trust its results.
-        validate: Option<CompactL0BypassPageCacheValidation>,
-    },
-}
-
-/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(rename_all = "kebab-case")]
-pub enum CompactL0BypassPageCacheValidation {
-    /// Validate that the series of (key, lsn) pairs are the same.
-    KeyLsn,
-    /// Validate that the entire output of old and new way is identical.
-    KeyLsnValue,
-}
-
-impl Default for CompactL0Phase1ValueAccess {
-    fn default() -> Self {
-        CompactL0Phase1ValueAccess::StreamingKmerge {
-            // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
-            validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
-        }
-    }
-}
-
 impl Timeline {
    /// Entry point for new tiered compaction algorithm.
    ///
@@ -1928,6 +1773,7 @@ impl Timeline {
            gc_cutoff,
            lowest_retain_lsn
        );
+
        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
        // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
        let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
@@ -1945,20 +1791,12 @@ impl Timeline {
                stat.visit_image_layer(desc.file_size());
            }
        }
-        for layer in &layer_selection {
-            let desc = layer.layer_desc();
-            let key_range = &desc.key_range;
-            if desc.is_delta() && key_range.start.next() != key_range.end {
-                let lsn_range = desc.lsn_range.clone();
-                let intersects = lsn_split_point.range(lsn_range).collect_vec();
-                if intersects.len() > 1 {
-                    bail!(
-                        "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
-                        desc.key(),
-                        intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
-                    );
-                }
-            }
+        let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
+            .iter()
+            .map(|layer| layer.layer_desc().layer_name())
+            .collect_vec();
+        if let Some(err) = check_valid_layermap(&layer_names) {
+            bail!("cannot run gc-compaction because {}", err);
        }
        // The maximum LSN we are processing in this compaction loop
        let end_lsn = layer_selection
@@ -1968,7 +1806,6 @@ impl Timeline {
            .unwrap();
        // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
        // as an L0 layer.
-        let hack_end_key = Key::NON_L0_MAX;
        let mut delta_layers = Vec::new();
        let mut image_layers = Vec::new();
        let mut downloaded_layers = Vec::new();
@@ -1985,7 +1822,12 @@ impl Timeline {
                image_layers.push(layer);
            }
        }
-        let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
+        let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
+        let mut merge_iter = FilterIterator::create(
+            MergeIterator::create(&delta_layers, &image_layers, ctx),
+            dense_ks,
+            sparse_ks,
+        )?;
        // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
        // Data of the same key.
        let mut accumulated_values = Vec::new();
@@ -2014,10 +1856,8 @@ impl Timeline {
            self.conf,
            self.timeline_id,
            self.tenant_shard_id,
-            Key::MIN,
            lowest_retain_lsn..end_lsn,
            self.get_compaction_target_size(),
-            ctx,
        )
        .await?;

@@ -2124,7 +1964,7 @@ impl Timeline {
        let produced_image_layers = if let Some(writer) = image_layer_writer {
            if !dry_run {
                writer
-                    .finish_with_discard_fn(self, ctx, hack_end_key, discard)
+                    .finish_with_discard_fn(self, ctx, Key::MAX, discard)
                    .await?
            } else {
                let (layers, _) = writer.take()?;
@@ -2137,7 +1977,7 @@ impl Timeline {

        let produced_delta_layers = if !dry_run {
            delta_layer_writer
-                .finish_with_discard_fn(self, ctx, hack_end_key, discard)
+                .finish_with_discard_fn(self, ctx, discard)
                .await?
        } else {
            let (layers, _) = delta_layer_writer.take()?;
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -135,25 +135,6 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
        .context("delete_all")
 }

-// This function removs remaining traces of a timeline on disk.
-// Namely: metadata file, timeline directory, delete mark.
-// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
-// delete mark should be present because it is the last step during deletion.
-// (nothing can fail after its deletion)
-async fn cleanup_remaining_timeline_fs_traces(
-    conf: &PageServerConf,
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
-) -> anyhow::Result<()> {
-    // Remove delete mark
-    // TODO: once we are confident that no more exist in the field, remove this
-    // line.  It cleans up a legacy marker file that might in rare cases be present.
-    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("remove delete mark")
-}
-
 /// It is important that this gets called when DeletionGuard is being held.
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
 async fn remove_timeline_from_tenant(
@@ -194,12 +175,10 @@ async fn remove_timeline_from_tenant(
 /// 7. Delete mark file
 ///
 /// It is resumable from any step in case a crash/restart occurs.
-/// There are three entrypoints to the process:
+/// There are two entrypoints to the process:
 /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
 /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
 ///    and we possibly neeed to continue deletion of remote files.
-/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
-///    index but still have local metadata, timeline directory and delete mark.
 ///
 /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
@@ -311,18 +290,6 @@ impl DeleteTimelineFlow {
        Ok(())
    }

-    #[instrument(skip_all, fields(%timeline_id))]
-    pub async fn cleanup_remaining_timeline_fs_traces(
-        tenant: &Tenant,
-        timeline_id: TimelineId,
-    ) -> anyhow::Result<()> {
-        let r =
-            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
-                .await;
-        info!("Done");
-        r
-    }
-
    fn prepare(
        tenant: &Tenant,
        timeline_id: TimelineId,
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,8 +30,8 @@ use crate::{
    pgdatadir_mapping::CollectKeySpaceError,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
-        storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError,
-        LogicalSizeCalculationCause, Tenant,
+        size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint,
+        tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
    },
 };

@@ -557,6 +557,8 @@ impl Timeline {
            gather_result = gather => {
                match gather_result {
                    Ok(_) => {},
+                    // It can happen sometimes that we hit this instead of the cancellation token firing above
+                    Err(CalculateSyntheticSizeError::Cancelled) => {}
                    Err(e) => {
                        // We don't care about the result, but, if it failed, we should log it,
                        // since consumption metric might be hitting the cached value and
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -31,7 +31,7 @@ use crate::{
    task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
    walingest::WalIngest,
-    walrecord::DecodedWALRecord,
+    walrecord::{decode_wal_record, DecodedWALRecord},
 };
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
@@ -312,10 +312,25 @@ pub(super) async fn handle_walreceiver_connection(
                waldecoder.feed_bytes(data);

                {
-                    let mut decoded = DecodedWALRecord::default();
                    let mut modification = timeline.begin_modification(startlsn);
                    let mut uncommitted_records = 0;
                    let mut filtered_records = 0;
+
+                    async fn commit(
+                        modification: &mut DatadirModification<'_>,
+                        uncommitted: &mut u64,
+                        filtered: &mut u64,
+                        ctx: &RequestContext,
+                    ) -> anyhow::Result<()> {
+                        WAL_INGEST
+                            .records_committed
+                            .inc_by(*uncommitted - *filtered);
+                        modification.commit(ctx).await?;
+                        *uncommitted = 0;
+                        *filtered = 0;
+                        Ok(())
+                    }
+
                    while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
@@ -324,9 +339,28 @@ pub(super) async fn handle_walreceiver_connection(
                            return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
                        }

+                        // Deserialize WAL record
+                        let mut decoded = DecodedWALRecord::default();
+                        decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?;
+
+                        if decoded.is_dbase_create_copy(timeline.pg_version)
+                            && uncommitted_records > 0
+                        {
+                            // Special case: legacy PG database creations operate by reading pages from a 'template' database:
+                            // these are the only kinds of WAL record that require reading data blocks while ingesting.  Ensure
+                            // all earlier writes of data blocks are visible by committing any modification in flight.
+                            commit(
+                                &mut modification,
+                                &mut uncommitted_records,
+                                &mut filtered_records,
+                                &ctx,
+                            )
+                            .await?;
+                        }
+
                        // Ingest the records without immediately committing them.
                        let ingested = walingest
-                            .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
+                            .ingest_record(decoded, lsn, &mut modification, &ctx)
                            .await
                            .with_context(|| format!("could not ingest record at {lsn}"))?;
                        if !ingested {
@@ -349,21 +383,25 @@ pub(super) async fn handle_walreceiver_connection(
                            || modification.approx_pending_bytes()
                                > DatadirModification::MAX_PENDING_BYTES
                        {
-                            WAL_INGEST
-                                .records_committed
-                                .inc_by(uncommitted_records - filtered_records);
-                            modification.commit(&ctx).await?;
-                            uncommitted_records = 0;
-                            filtered_records = 0;
+                            commit(
+                                &mut modification,
+                                &mut uncommitted_records,
+                                &mut filtered_records,
+                                &ctx,
+                            )
+                            .await?;
                        }
                    }

                    // Commit the remaining records.
                    if uncommitted_records > 0 {
-                        WAL_INGEST
-                            .records_committed
-                            .inc_by(uncommitted_records - filtered_records);
-                        modification.commit(&ctx).await?;
+                        commit(
+                            &mut modification,
+                            &mut uncommitted_records,
+                            &mut filtered_records,
+                            &ctx,
+                        )
+                        .await?;
                    }
                }

--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -16,9 +16,9 @@
 //! Note that the vectored blob api does *not* go through the page cache.

 use std::collections::BTreeMap;
-use std::num::NonZeroUsize;
+use std::ops::Deref;

-use bytes::BytesMut;
+use bytes::{Bytes, BytesMut};
 use pageserver_api::key::Key;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::BoundedBuf;
@@ -29,9 +29,6 @@ use crate::context::RequestContext;
 use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
 use crate::virtual_file::{self, VirtualFile};

-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub struct MaxVectoredReadBytes(pub NonZeroUsize);
-
 /// Metadata bundled with the start and end offset of a blob.
 #[derive(Copy, Clone, Debug)]
 pub struct BlobMeta {
@@ -39,11 +36,123 @@ pub struct BlobMeta {
    pub lsn: Lsn,
 }

-/// Blob offsets into [`VectoredBlobsBuf::buf`]
+/// A view into the vectored blobs read buffer.
+#[derive(Clone, Debug)]
+pub(crate) enum BufView<'a> {
+    Slice(&'a [u8]),
+    Bytes(bytes::Bytes),
+}
+
+impl<'a> BufView<'a> {
+    /// Creates a new slice-based view on the blob.
+    pub fn new_slice(slice: &'a [u8]) -> Self {
+        Self::Slice(slice)
+    }
+
+    /// Creates a new [`bytes::Bytes`]-based view on the blob.
+    pub fn new_bytes(bytes: bytes::Bytes) -> Self {
+        Self::Bytes(bytes)
+    }
+
+    /// Convert the view into `Bytes`.
+    ///
+    /// If using slice as the underlying storage, the copy will be an O(n) operation.
+    pub fn into_bytes(self) -> Bytes {
+        match self {
+            BufView::Slice(slice) => Bytes::copy_from_slice(slice),
+            BufView::Bytes(bytes) => bytes,
+        }
+    }
+
+    /// Creates a sub-view of the blob based on the range.
+    fn view(&self, range: std::ops::Range<usize>) -> Self {
+        match self {
+            BufView::Slice(slice) => BufView::Slice(&slice[range]),
+            BufView::Bytes(bytes) => BufView::Bytes(bytes.slice(range)),
+        }
+    }
+}
+
+impl<'a> Deref for BufView<'a> {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            BufView::Slice(slice) => slice,
+            BufView::Bytes(bytes) => bytes,
+        }
+    }
+}
+
+impl<'a> AsRef<[u8]> for BufView<'a> {
+    fn as_ref(&self) -> &[u8] {
+        match self {
+            BufView::Slice(slice) => slice,
+            BufView::Bytes(bytes) => bytes.as_ref(),
+        }
+    }
+}
+
+impl<'a> From<&'a [u8]> for BufView<'a> {
+    fn from(value: &'a [u8]) -> Self {
+        Self::new_slice(value)
+    }
+}
+
+impl From<Bytes> for BufView<'_> {
+    fn from(value: Bytes) -> Self {
+        Self::new_bytes(value)
+    }
+}
+
+/// Blob offsets into [`VectoredBlobsBuf::buf`]. The byte ranges is potentially compressed,
+/// subject to [`VectoredBlob::compression_bits`].
 pub struct VectoredBlob {
-    pub start: usize,
-    pub end: usize,
+    /// Blob metadata.
    pub meta: BlobMeta,
+    /// Start offset.
+    start: usize,
+    /// End offset.
+    end: usize,
+    /// Compression used on the the blob.
+    compression_bits: u8,
+}
+
+impl VectoredBlob {
+    /// Reads a decompressed view of the blob.
+    pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result<BufView<'a>, std::io::Error> {
+        let view = buf.view(self.start..self.end);
+
+        match self.compression_bits {
+            BYTE_UNCOMPRESSED => Ok(view),
+            BYTE_ZSTD => {
+                let mut decompressed_vec = Vec::new();
+                let mut decoder =
+                    async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
+                decoder.write_all(&view).await?;
+                decoder.flush().await?;
+                // Zero-copy conversion from `Vec` to `Bytes`
+                Ok(BufView::new_bytes(Bytes::from(decompressed_vec)))
+            }
+            bits => {
+                let error = std::io::Error::new(
+                    std::io::ErrorKind::InvalidData,
+                    format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end),
+                );
+                Err(error)
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for VectoredBlob {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}@{}, {}..{}",
+            self.meta.key, self.meta.lsn, self.start, self.end
+        )
+    }
 }

 /// Return type of [`VectoredBlobReader::read_blobs`]
@@ -518,7 +627,7 @@ impl<'a> VectoredBlobReader<'a> {
            );
        }

-        let mut buf = self
+        let buf = self
            .file
            .read_exact_at(buf.slice(0..read.size()), read.start, ctx)
            .await?
@@ -533,9 +642,6 @@ impl<'a> VectoredBlobReader<'a> {
        // of a blob is implicit: the start of the next blob if one exists
        // or the end of the read.

-        // Some scratch space, put here for reusing the allocation
-        let mut decompressed_vec = Vec::new();
-
        for (blob_start, meta) in blobs_at {
            let blob_start_in_buf = blob_start - start_offset;
            let first_len_byte = buf[blob_start_in_buf as usize];
@@ -561,35 +667,14 @@ impl<'a> VectoredBlobReader<'a> {
                )
            };

-            let start_raw = blob_start_in_buf + size_length;
-            let end_raw = start_raw + blob_size;
-            let (start, end);
-            if compression_bits == BYTE_UNCOMPRESSED {
-                start = start_raw as usize;
-                end = end_raw as usize;
-            } else if compression_bits == BYTE_ZSTD {
-                let mut decoder =
-                    async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
-                decoder
-                    .write_all(&buf[start_raw as usize..end_raw as usize])
-                    .await?;
-                decoder.flush().await?;
-                start = buf.len();
-                buf.extend_from_slice(&decompressed_vec);
-                end = buf.len();
-                decompressed_vec.clear();
-            } else {
-                let error = std::io::Error::new(
-                    std::io::ErrorKind::InvalidData,
-                    format!("invalid compression byte {compression_bits:x}"),
-                );
-                return Err(error);
-            }
+            let start = (blob_start_in_buf + size_length) as usize;
+            let end = start + blob_size as usize;

            metas.push(VectoredBlob {
                start,
                end,
                meta: *meta,
+                compression_bits,
            });
        }

@@ -597,8 +682,10 @@ impl<'a> VectoredBlobReader<'a> {
    }
 }

-/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
-/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
+/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
+///
+/// It provides a streaming API for getting read blobs. It returns a batch when
+/// `handle` gets called and when the current key would just exceed the read_size and
 /// max_cnt constraints.
 pub struct StreamingVectoredReadPlanner {
    read_builder: Option<VectoredReadBuilder>,
@@ -1022,8 +1109,13 @@ mod tests {
            let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
            assert_eq!(result.blobs.len(), 1);
            let read_blob = &result.blobs[0];
-            let read_buf = &result.buf[read_blob.start..read_blob.end];
-            assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
+            let view = BufView::new_slice(&result.buf);
+            let read_buf = read_blob.read(&view).await?;
+            assert_eq!(
+                &blob[..],
+                &read_buf[..],
+                "mismatch for idx={idx} at offset={offset}"
+            );
            buf = result.buf;
        }
        Ok(())
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1,6 +1,7 @@
-//!
 //! VirtualFile is like a normal File, but it's not bound directly to
-//! a file descriptor. Instead, the file is opened when it's read from,
+//! a file descriptor.
+//!
+//! Instead, the file is opened when it's read from,
 //! and if too many files are open globally in the system, least-recently
 //! used ones are closed.
 //!
@@ -10,7 +11,6 @@
 //! This is similar to PostgreSQL's virtual file descriptor facility in
 //! src/backend/storage/file/fd.c
 //!
-use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use crate::context::RequestContext;
 use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};

@@ -19,6 +19,7 @@ use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use owned_buffers_io::io_buf_ext::FullSlice;
+use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver_api::shard::TenantShardId;
 use std::fs::File;
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -84,9 +84,14 @@ pub(crate) fn get() -> IoEngine {
                        }
                    },
                    Err(std::env::VarError::NotPresent) => {
-                        crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
-                            .parse()
-                            .unwrap()
+                        #[cfg(target_os = "linux")]
+                        {
+                            IoEngineKind::TokioEpollUring
+                        }
+                        #[cfg(not(target_os = "linux"))]
+                        {
+                            IoEngineKind::StdFs
+                        }
                    }
                    Err(std::env::VarError::NotUnicode(_)) => {
                        panic!("env var {env_var_name} is not unicode");
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -25,9 +25,7 @@ use std::time::Duration;
 use std::time::SystemTime;

 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
-use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
-use postgres_ffi::TimestampTz;
+use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};

 use anyhow::{bail, Context, Result};
@@ -48,13 +46,29 @@ use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
-use postgres_ffi::v14::xlog_utils::*;
-use postgres_ffi::v14::CheckPoint;
 use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
+use utils::bin_ser::SerializeError;
 use utils::lsn::Lsn;

+enum_pgversion! {CheckPoint, pgv::CheckPoint}
+
+impl CheckPoint {
+    fn encode(&self) -> Result<Bytes, SerializeError> {
+        enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() })
+    }
+
+    fn update_next_xid(&mut self, xid: u32) -> bool {
+        enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) })
+    }
+
+    pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
+        enum_pgversion_dispatch!(self, CheckPoint, cp, {
+            cp.update_next_multixid(multi_xid, multi_offset)
+        })
+    }
+}
+
 pub struct WalIngest {
    shard: ShardIdentity,
    checkpoint: CheckPoint,
@@ -77,8 +91,13 @@ impl WalIngest {
        // Fetch the latest checkpoint into memory, so that we can compare with it
        // quickly in `ingest_record` and update it when it changes.
        let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
-        let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
-        trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
+        let pgversion = timeline.pg_version;
+
+        let checkpoint = dispatch_pgversion!(pgversion, {
+            let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
+            trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
+            <pgv::CheckPoint as Into<CheckPoint>>::into(checkpoint)
+        });

        Ok(WalIngest {
            shard: *timeline.get_shard_identity(),
@@ -104,10 +123,9 @@ impl WalIngest {
    ///
    pub async fn ingest_record(
        &mut self,
-        recdata: Bytes,
+        decoded: DecodedWALRecord,
        lsn: Lsn,
        modification: &mut DatadirModification<'_>,
-        decoded: &mut DecodedWALRecord,
        ctx: &RequestContext,
    ) -> anyhow::Result<bool> {
        WAL_INGEST.records_received.inc();
@@ -115,7 +133,12 @@ impl WalIngest {
        let prev_len = modification.len();

        modification.set_lsn(lsn)?;
-        decode_wal_record(recdata, decoded, pg_version)?;
+
+        if decoded.is_dbase_create_copy(pg_version) {
+            // Records of this type should always be preceded by a commit(), as they
+            // rely on reading data pages back from the Timeline.
+            assert!(!modification.has_dirty_data_pages());
+        }

        let mut buf = decoded.record.clone();
        buf.advance(decoded.main_data_offset);
@@ -133,11 +156,11 @@ impl WalIngest {
            pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
                // Heap AM records need some special handling, because they modify VM pages
                // without registering them with the standard mechanism.
-                self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
+                self.ingest_heapam_record(&mut buf, modification, &decoded, ctx)
                    .await?;
            }
            pg_constants::RM_NEON_ID => {
-                self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
+                self.ingest_neonrmgr_record(&mut buf, modification, &decoded, ctx)
                    .await?;
            }
            // Handle other special record types
@@ -214,6 +237,26 @@ impl WalIngest {
                                .await?;
                        }
                    }
+                } else if pg_version == 17 {
+                    if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                        debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+                    } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                        // The XLOG record was renamed between v14 and v15,
+                        // but the record format is the same.
+                        // So we can reuse XlCreateDatabase here.
+                        debug!("XLOG_DBASE_CREATE_FILE_COPY");
+                        let createdb = XlCreateDatabase::decode(&mut buf);
+                        self.ingest_xlog_dbase_create(modification, &createdb, ctx)
+                            .await?;
+                    } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP {
+                        let dropdb = XlDropDatabase::decode(&mut buf);
+                        for tablespace_id in dropdb.tablespace_ids {
+                            trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
+                            modification
+                                .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
+                                .await?;
+                        }
+                    }
                }
            }
            pg_constants::RM_TBLSPC_ID => {
@@ -223,7 +266,11 @@ impl WalIngest {
                let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;

                if info == pg_constants::CLOG_ZEROPAGE {
-                    let pageno = buf.get_u32_le();
+                    let pageno = if pg_version < 17 {
+                        buf.get_u32_le()
+                    } else {
+                        buf.get_u64_le() as u32
+                    };
                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                    self.put_slru_page_image(
@@ -237,7 +284,7 @@ impl WalIngest {
                    .await?;
                } else {
                    assert!(info == pg_constants::CLOG_TRUNCATE);
-                    let xlrec = XlClogTruncate::decode(&mut buf);
+                    let xlrec = XlClogTruncate::decode(&mut buf, pg_version);
                    self.ingest_clog_truncate_record(modification, &xlrec, ctx)
                        .await?;
                }
@@ -276,12 +323,21 @@ impl WalIngest {
                        parsed_xact.xid,
                        lsn,
                    );
-                    modification
-                        .drop_twophase_file(parsed_xact.xid, ctx)
-                        .await?;
+
+                    let xid: u64 = if pg_version >= 17 {
+                        self.adjust_to_full_transaction_id(parsed_xact.xid)?
+                    } else {
+                        parsed_xact.xid as u64
+                    };
+                    modification.drop_twophase_file(xid, ctx).await?;
                } else if info == pg_constants::XLOG_XACT_PREPARE {
+                    let xid: u64 = if pg_version >= 17 {
+                        self.adjust_to_full_transaction_id(decoded.xl_xid)?
+                    } else {
+                        decoded.xl_xid as u64
+                    };
                    modification
-                        .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx)
+                        .put_twophase_file(xid, Bytes::copy_from_slice(&buf[..]), ctx)
                        .await?;
                }
            }
@@ -289,7 +345,11 @@ impl WalIngest {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;

                if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
-                    let pageno = buf.get_u32_le();
+                    let pageno = if pg_version < 17 {
+                        buf.get_u32_le()
+                    } else {
+                        buf.get_u64_le() as u32
+                    };
                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                    self.put_slru_page_image(
@@ -302,7 +362,11 @@ impl WalIngest {
                    )
                    .await?;
                } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
-                    let pageno = buf.get_u32_le();
+                    let pageno = if pg_version < 17 {
+                        buf.get_u32_le()
+                    } else {
+                        buf.get_u64_le() as u32
+                    };
                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                    self.put_slru_page_image(
@@ -325,76 +389,99 @@ impl WalIngest {
            }
            pg_constants::RM_RELMAP_ID => {
                let xlrec = XlRelmapUpdate::decode(&mut buf);
-                self.ingest_relmap_page(modification, &xlrec, decoded, ctx)
+                self.ingest_relmap_page(modification, &xlrec, &decoded, ctx)
                    .await?;
            }
            pg_constants::RM_XLOG_ID => {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;

-                if info == pg_constants::XLOG_NEXTOID {
-                    let next_oid = buf.get_u32_le();
-                    if self.checkpoint.nextOid != next_oid {
-                        self.checkpoint.nextOid = next_oid;
+                if info == pg_constants::XLOG_PARAMETER_CHANGE {
+                    if let CheckPoint::V17(cp) = &mut self.checkpoint {
+                        let rec = v17::XlParameterChange::decode(&mut buf);
+                        cp.wal_level = rec.wal_level;
                        self.checkpoint_modified = true;
                    }
-                } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
-                    || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
-                {
-                    let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
-                    buf.copy_to_slice(&mut checkpoint_bytes);
-                    let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
-                    trace!(
-                        "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
-                        xlog_checkpoint.oldestXid,
-                        self.checkpoint.oldestXid
-                    );
-                    if (self
-                        .checkpoint
-                        .oldestXid
-                        .wrapping_sub(xlog_checkpoint.oldestXid) as i32)
-                        < 0
-                    {
-                        self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
+                } else if info == pg_constants::XLOG_END_OF_RECOVERY {
+                    if let CheckPoint::V17(cp) = &mut self.checkpoint {
+                        let rec = v17::XlEndOfRecovery::decode(&mut buf);
+                        cp.wal_level = rec.wal_level;
+                        self.checkpoint_modified = true;
                    }
-                    trace!(
-                        "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
-                        xlog_checkpoint.oldestActiveXid,
-                        self.checkpoint.oldestActiveXid
-                    );
-
-                    // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
-                    // because at shutdown, all in-progress transactions will implicitly
-                    // end. Postgres startup code knows that, and allows hot standby to start
-                    // immediately from a shutdown checkpoint.
-                    //
-                    // In Neon, Postgres hot standby startup always behaves as if starting from
-                    // an online checkpoint. It needs a valid `oldestActiveXid` value, so
-                    // instead of overwriting self.checkpoint.oldestActiveXid with
-                    // InvalidTransactionid from the checkpoint WAL record, update it to a
-                    // proper value, knowing that there are no in-progress transactions at this
-                    // point, except for prepared transactions.
-                    //
-                    // See also the neon code changes in the InitWalRecovery() function.
-                    if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
-                        && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
-                    {
-                        let mut oldest_active_xid = self.checkpoint.nextXid.value as u32;
-                        for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
-                            if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
-                                oldest_active_xid = xid;
-                            }
-                        }
-                        self.checkpoint.oldestActiveXid = oldest_active_xid;
-                    } else {
-                        self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
-                    }
-
-                    // Write a new checkpoint key-value pair on every checkpoint record, even
-                    // if nothing really changed. Not strictly required, but it seems nice to
-                    // have some trace of the checkpoint records in the layer files at the same
-                    // LSNs.
-                    self.checkpoint_modified = true;
                }
+
+                enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+                    if info == pg_constants::XLOG_NEXTOID {
+                        let next_oid = buf.get_u32_le();
+                        if cp.nextOid != next_oid {
+                            cp.nextOid = next_oid;
+                            self.checkpoint_modified = true;
+                        }
+                    } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
+                        || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                    {
+                        let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT];
+                        buf.copy_to_slice(&mut checkpoint_bytes);
+                        let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
+                        trace!(
+                            "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
+                            xlog_checkpoint.oldestXid,
+                            cp.oldestXid
+                        );
+                        if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
+                            cp.oldestXid = xlog_checkpoint.oldestXid;
+                        }
+                        trace!(
+                            "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
+                            xlog_checkpoint.oldestActiveXid,
+                            cp.oldestActiveXid
+                        );
+
+                        // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
+                        // because at shutdown, all in-progress transactions will implicitly
+                        // end. Postgres startup code knows that, and allows hot standby to start
+                        // immediately from a shutdown checkpoint.
+                        //
+                        // In Neon, Postgres hot standby startup always behaves as if starting from
+                        // an online checkpoint. It needs a valid `oldestActiveXid` value, so
+                        // instead of overwriting self.checkpoint.oldestActiveXid with
+                        // InvalidTransactionid from the checkpoint WAL record, update it to a
+                        // proper value, knowing that there are no in-progress transactions at this
+                        // point, except for prepared transactions.
+                        //
+                        // See also the neon code changes in the InitWalRecovery() function.
+                        if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
+                            && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                        {
+                            let oldest_active_xid = if pg_version >= 17 {
+                                let mut oldest_active_full_xid = cp.nextXid.value;
+                                for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                                    if xid < oldest_active_full_xid {
+                                        oldest_active_full_xid = xid;
+                                    }
+                                }
+                                oldest_active_full_xid as u32
+                            } else {
+                                let mut oldest_active_xid = cp.nextXid.value as u32;
+                                for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                                    let narrow_xid = xid as u32;
+                                    if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
+                                        oldest_active_xid = narrow_xid;
+                                    }
+                                }
+                                oldest_active_xid
+                            };
+                            cp.oldestActiveXid = oldest_active_xid;
+                        } else {
+                            cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
+                        }
+
+                        // Write a new checkpoint key-value pair on every checkpoint record, even
+                        // if nothing really changed. Not strictly required, but it seems nice to
+                        // have some trace of the checkpoint records in the layer files at the same
+                        // LSNs.
+                        self.checkpoint_modified = true;
+                    }
+                });
            }
            pg_constants::RM_LOGICALMSG_ID => {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
@@ -418,7 +505,11 @@ impl WalIngest {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
                if info == pg_constants::XLOG_RUNNING_XACTS {
                    let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
-                    self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
+
+                    enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+                        cp.oldestActiveXid = xlrec.oldest_running_xid;
+                    });
+
                    self.checkpoint_modified = true;
                }
            }
@@ -470,7 +561,7 @@ impl WalIngest {

                continue;
            }
-            self.ingest_decoded_block(modification, lsn, decoded, blk, ctx)
+            self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx)
                .await?;
        }

@@ -486,9 +577,30 @@ impl WalIngest {
        // until commit() is called to flush the data into the repository and update
        // the latest LSN.

+        modification.on_record_end();
+
        Ok(modification.len() > prev_len)
    }

+    /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
+    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
+        let next_full_xid =
+            enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
+
+        let next_xid = (next_full_xid) as u32;
+        let mut epoch = (next_full_xid >> 32) as u32;
+
+        if xid > next_xid {
+            // Wraparound occurred, must be from a prev epoch.
+            if epoch == 0 {
+                bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}");
+            }
+            epoch -= 1;
+        }
+
+        Ok((epoch as u64) << 32 | xid as u64)
+    }
+
    /// Do not store this block, but observe it for the purposes of updating our relation size state.
    async fn observe_decoded_block(
        &mut self,
@@ -531,7 +643,7 @@ impl WalIngest {
            && blk.has_image
            && decoded.xl_rmid == pg_constants::RM_XLOG_ID
            && (decoded.xl_info == pg_constants::XLOG_FPI
-                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
+            || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
            // compression of WAL is not yet supported: fall back to storing the original WAL record
            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)
            // do not materialize null pages because them most likely be soon replaced with real data
@@ -557,6 +669,7 @@ impl WalIngest {
                page_set_lsn(&mut image, lsn)
            }
            assert_eq!(image.len(), BLCKSZ as usize);
+
            self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx)
                .await?;
        } else {
@@ -788,6 +901,73 @@ impl WalIngest {
                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
+            17 => {
+                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+
+                    if info == pg_constants::XLOG_HEAP_INSERT {
+                        let xlrec = v17::XlHeapInsert::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_DELETE {
+                        let xlrec = v17::XlHeapDelete::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_UPDATE
+                        || info == pg_constants::XLOG_HEAP_HOT_UPDATE
+                    {
+                        let xlrec = v17::XlHeapUpdate::decode(buf);
+                        // the size of tuple data is inferred from the size of the record.
+                        // we can't validate the remaining number of bytes without parsing
+                        // the tuple data.
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
+                        }
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
+                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
+                            // non-HOT update where the new tuple goes to different page than
+                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
+                            // set.
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_LOCK {
+                        let xlrec = v17::XlHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+                    if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
+                        let xlrec = v17::XlHeapMultiInsert::decode(buf);
+
+                        let offset_array_len =
+                            if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
+                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
+                                0
+                            } else {
+                                size_of::<u16>() * xlrec.ntuples as usize
+                            };
+                        assert_eq!(offset_array_len, buf.remaining());
+
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
+                        let xlrec = v17::XlHeapLockUpdated::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else {
+                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
+                }
+            }
            _ => {}
        }

@@ -896,26 +1076,26 @@ impl WalIngest {
        assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);

        match pg_version {
-            16 => {
+            16 | 17 => {
                let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

                match info {
                    pg_constants::XLOG_NEON_HEAP_INSERT => {
-                        let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf);
                        assert_eq!(0, buf.remaining());
                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
                            new_heap_blkno = Some(decoded.blocks[0].blkno);
                        }
                    }
                    pg_constants::XLOG_NEON_HEAP_DELETE => {
-                        let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf);
                        if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
                            new_heap_blkno = Some(decoded.blocks[0].blkno);
                        }
                    }
                    pg_constants::XLOG_NEON_HEAP_UPDATE
                    | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => {
-                        let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf);
                        // the size of tuple data is inferred from the size of the record.
                        // we can't validate the remaining number of bytes without parsing
                        // the tuple data.
@@ -931,7 +1111,7 @@ impl WalIngest {
                        }
                    }
                    pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => {
-                        let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf);

                        let offset_array_len =
                            if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
@@ -947,7 +1127,7 @@ impl WalIngest {
                        }
                    }
                    pg_constants::XLOG_NEON_HEAP_LOCK => {
-                        let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf);
                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
                            old_heap_blkno = Some(decoded.blocks[0].blkno);
                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
@@ -1195,7 +1375,7 @@ impl WalIngest {
            if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
                // Tail of last remaining FSM page has to be zeroed.
                // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
-                modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
+                modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
                fsm_physical_page_no += 1;
            }
            let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1217,7 +1397,7 @@ impl WalIngest {
            if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
                // Tail of last remaining vm page has to be zeroed.
                // We are not precise here and instead of digging in VM bitmap format just clear the whole page.
-                modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
+                modification.put_rel_page_image_zero(rel, vm_page_no)?;
                vm_page_no += 1;
            }
            let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1233,12 +1413,17 @@ impl WalIngest {
    fn warn_on_ingest_lag(
        &mut self,
        conf: &crate::config::PageServerConf,
-        wal_timestmap: TimestampTz,
+        wal_timestamp: TimestampTz,
    ) {
        debug_assert_current_span_has_tenant_and_timeline_id();
        let now = SystemTime::now();
        let rate_limits = &mut self.warn_ingest_lag;
-        match try_from_pg_timestamp(wal_timestmap) {
+
+        let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, {
+            pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp)
+        });
+
+        match ts {
            Ok(ts) => {
                match now.duration_since(ts) {
                    Ok(lag) => {
@@ -1248,7 +1433,7 @@ impl WalIngest {
                                warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
                            })
                        }
-                    },
+                    }
                    Err(e) => {
                        let delta_t = e.duration();
                        // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds)
@@ -1262,7 +1447,6 @@ impl WalIngest {
                        }
                    }
                };
-
            }
            Err(error) => {
                rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| {
@@ -1370,14 +1554,17 @@ impl WalIngest {
        // truncated, but a checkpoint record with the updated values isn't written until
        // later. In Neon, a server can start at any LSN, not just on a checkpoint record,
        // so we keep the oldestXid and oldestXidDB up-to-date.
-        self.checkpoint.oldestXid = xlrec.oldest_xid;
-        self.checkpoint.oldestXidDB = xlrec.oldest_xid_db;
+        enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+            cp.oldestXid = xlrec.oldest_xid;
+            cp.oldestXidDB = xlrec.oldest_xid_db;
+        });
        self.checkpoint_modified = true;

        // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it

        let latest_page_number =
-            self.checkpoint.nextXid.value as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+            enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32
+                / pg_constants::CLOG_XACTS_PER_PAGE;

        // Now delete all segments containing pages between xlrec.pageno
        // and latest_page_number.
@@ -1385,7 +1572,9 @@ impl WalIngest {
        // First, make an important safety check:
        // the current endpoint page must not be eligible for removal.
        // See SimpleLruTruncate() in slru.c
-        if clogpage_precedes(latest_page_number, xlrec.pageno) {
+        if dispatch_pgversion!(modification.tline.pg_version, {
+            pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, xlrec.pageno)
+        }) {
            info!("could not truncate directory pg_xact apparent wraparound");
            return Ok(());
        }
@@ -1402,7 +1591,12 @@ impl WalIngest {
            .await?
        {
            let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
-            if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
+
+            let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
+                pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, xlrec.pageno)
+            });
+
+            if may_delete {
                modification
                    .drop_slru_segment(SlruKind::Clog, segno, ctx)
                    .await?;
@@ -1521,14 +1715,23 @@ impl WalIngest {
        xlrec: &XlMultiXactTruncate,
        ctx: &RequestContext,
    ) -> Result<()> {
-        self.checkpoint.oldestMulti = xlrec.end_trunc_off;
-        self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
+        let (maxsegment, startsegment, endsegment) =
+            enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+                cp.oldestMulti = xlrec.end_trunc_off;
+                cp.oldestMultiDB = xlrec.oldest_multi_db;
+                let maxsegment: i32 = pgv::nonrelfile_utils::mx_offset_to_member_segment(
+                    pg_constants::MAX_MULTIXACT_OFFSET,
+                );
+                let startsegment: i32 =
+                    pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.start_trunc_memb);
+                let endsegment: i32 =
+                    pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.end_trunc_memb);
+                (maxsegment, startsegment, endsegment)
+            });
+
        self.checkpoint_modified = true;

        // PerformMembersTruncation
-        let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
-        let startsegment: i32 = mx_offset_to_member_segment(xlrec.start_trunc_memb);
-        let endsegment: i32 = mx_offset_to_member_segment(xlrec.end_trunc_memb);
        let mut segment: i32 = startsegment;

        // Delete all the segments except the last one. The last segment can still
@@ -1687,7 +1890,7 @@ impl WalIngest {
                    continue;
                }

-                modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
+                modification.put_rel_page_image_zero(rel, gap_blknum)?;
            }
        }
        Ok(())
@@ -1753,7 +1956,7 @@ impl WalIngest {

            // fill the gap with zeros
            for gap_blknum in old_nblocks..blknum {
-                modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?;
+                modification.put_slru_page_image_zero(kind, segno, gap_blknum)?;
            }
        }
        Ok(())
@@ -1802,11 +2005,23 @@ mod tests {
        // TODO
    }

-    static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
+    #[tokio::test]
+    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
+        for i in 14..=16 {
+            dispatch_pgversion!(i, {
+                pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
+            });
+        }
+
+        Ok(())
+    }

    async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
        let mut m = tline.begin_modification(Lsn(0x10));
-        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
+        m.put_checkpoint(dispatch_pgversion!(
+            tline.pg_version,
+            pgv::ZERO_CHECKPOINT.clone()
+        ))?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
        m.commit(ctx).await?;
        let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
@@ -1827,21 +2042,25 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
            .await?;
+        m.on_record_end();
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
            .await?;
+        m.on_record_end();
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
            .await?;
+        m.on_record_end();
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
            .await?;
+        m.on_record_end();
        m.commit(&ctx).await?;

        assert_current_logical_size(&tline, Lsn(0x50));
@@ -1983,6 +2202,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
            .await?;
+        m.on_record_end();
        m.commit(&ctx).await?;
        assert_eq!(
            tline
@@ -2008,6 +2228,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
            .await?;
+        m.on_record_end();
        m.commit(&ctx).await?;
        assert_eq!(
            tline
@@ -2409,7 +2630,6 @@ mod tests {
            .await
            .unwrap();
        let mut modification = tline.begin_modification(startpoint);
-        let mut decoded = DecodedWALRecord::default();
        println!("decoding {} bytes", bytes.len() - xlogoff);

        // Decode and ingest wal. We process the wal in chunks because
@@ -2417,8 +2637,10 @@ mod tests {
        for chunk in bytes[xlogoff..].chunks(50) {
            decoder.feed_bytes(chunk);
            while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
+                let mut decoded = DecodedWALRecord::default();
+                decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap();
                walingest
-                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
+                    .ingest_record(decoded, lsn, &mut modification, &ctx)
                    .instrument(span.clone())
                    .await
                    .unwrap();
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -160,6 +160,31 @@ pub struct DecodedWALRecord {
    pub origin_id: u16,
 }

+impl DecodedWALRecord {
+    /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations
+    /// by reading other existing relations' data blocks.  This is more complex to apply than new-style database
+    /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case.
+    pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool {
+        if self.xl_rmid == pg_constants::RM_DBASE_ID {
+            let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+            match pg_version {
+                14 => {
+                    // Postgres 14 database creations are always the legacy kind
+                    info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE
+                }
+                15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                _ => {
+                    panic!("Unsupported postgres version {pg_version}")
+                }
+            }
+        } else {
+            false
+        }
+    }
+}
+
 #[repr(C)]
 #[derive(Debug, Clone, Copy)]
 pub struct RelFileNode {
@@ -317,16 +342,47 @@ pub mod v14 {
            }
        }
    }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlParameterChange {
+        pub max_connections: i32,
+        pub max_worker_processes: i32,
+        pub max_wal_senders: i32,
+        pub max_prepared_xacts: i32,
+        pub max_locks_per_xact: i32,
+        pub wal_level: i32,
+        pub wal_log_hints: bool,
+        pub track_commit_timestamp: bool,
+        pub _padding: [u8; 2],
+    }
+
+    impl XlParameterChange {
+        pub fn decode(buf: &mut Bytes) -> XlParameterChange {
+            XlParameterChange {
+                max_connections: buf.get_i32_le(),
+                max_worker_processes: buf.get_i32_le(),
+                max_wal_senders: buf.get_i32_le(),
+                max_prepared_xacts: buf.get_i32_le(),
+                max_locks_per_xact: buf.get_i32_le(),
+                wal_level: buf.get_i32_le(),
+                wal_log_hints: buf.get_u8() != 0,
+                track_commit_timestamp: buf.get_u8() != 0,
+                _padding: [buf.get_u8(), buf.get_u8()],
+            }
+        }
+    }
 }

 pub mod v15 {
    pub use super::v14::{
        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate,
+        XlParameterChange,
    };
 }

 pub mod v16 {
-    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert};
+    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange};
    use bytes::{Buf, Bytes};
    use postgres_ffi::{OffsetNumber, TransactionId};

@@ -505,6 +561,37 @@ pub mod v16 {
    }
 }

+pub mod v17 {
+    pub use super::v14::XlHeapLockUpdated;
+    use bytes::{Buf, Bytes};
+    pub use postgres_ffi::{TimeLineID, TimestampTz};
+
+    pub use super::v16::rm_neon;
+    pub use super::v16::{
+        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange,
+    };
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlEndOfRecovery {
+        pub end_time: TimestampTz,
+        pub this_time_line_id: TimeLineID,
+        pub prev_time_line_id: TimeLineID,
+        pub wal_level: i32,
+    }
+
+    impl XlEndOfRecovery {
+        pub fn decode(buf: &mut Bytes) -> XlEndOfRecovery {
+            XlEndOfRecovery {
+                end_time: buf.get_i64_le(),
+                this_time_line_id: buf.get_u32_le(),
+                prev_time_line_id: buf.get_u32_le(),
+                wal_level: buf.get_i32_le(),
+            }
+        }
+    }
+}
+
 #[repr(C)]
 #[derive(Debug)]
 pub struct XlSmgrCreate {
@@ -722,9 +809,13 @@ pub struct XlClogTruncate {
 }

 impl XlClogTruncate {
-    pub fn decode(buf: &mut Bytes) -> XlClogTruncate {
+    pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate {
        XlClogTruncate {
-            pageno: buf.get_u32_le(),
+            pageno: if pg_version < 17 {
+                buf.get_u32_le()
+            } else {
+                buf.get_u64_le() as u32
+            },
            oldest_xid: buf.get_u32_le(),
            oldest_xid_db: buf.get_u32_le(),
        }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -35,6 +35,7 @@ use anyhow::Context;
 use bytes::{Bytes, BytesMut};
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
+use std::future::Future;
 use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
@@ -43,13 +44,12 @@ use utils::lsn::Lsn;
 use utils::sync::gate::GateError;
 use utils::sync::heavier_once_cell;

+/// The real implementation that uses a Postgres process to
+/// perform WAL replay.
 ///
-/// This is the real implementation that uses a Postgres process to
-/// perform WAL replay. Only one thread can use the process at a time,
-/// that is controlled by the Mutex. In the future, we might want to
-/// launch a pool of processes to allow concurrent replay of multiple
-/// records.
-///
+/// Only one thread can use the process at a time, that is controlled by the
+/// Mutex. In the future, we might want to launch a pool of processes to allow
+/// concurrent replay of multiple records.
 pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
@@ -205,6 +205,22 @@ impl PostgresRedoManager {
        }
    }

+    /// Do a ping request-response roundtrip.
+    ///
+    /// Not used in production, but by Rust benchmarks.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
+    pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
+        self.do_with_walredo_process(pg_version, |proc| async move {
+            proc.ping(Duration::from_secs(1))
+                .await
+                .map_err(Error::Other)
+        })
+        .await
+    }
+
    pub fn status(&self) -> WalRedoManagerStatus {
        WalRedoManagerStatus {
            last_redo_at: {
@@ -297,6 +313,100 @@ impl PostgresRedoManager {
        }
    }

+    /// # Cancel-Safety
+    ///
+    /// This method is cancel-safe iff `closure` is cancel-safe.
+    async fn do_with_walredo_process<
+        F: FnOnce(Arc<Process>) -> Fut,
+        Fut: Future<Output = Result<O, Error>>,
+        O,
+    >(
+        &self,
+        pg_version: u32,
+        closure: F,
+    ) -> Result<O, Error> {
+        let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
+            Ok(guard) => match &*guard {
+                ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
+                ProcessOnceCell::ManagerShutDown => {
+                    return Err(Error::Cancelled);
+                }
+            },
+            Err(permit) => {
+                let start = Instant::now();
+                // acquire guard before spawning process, so that we don't spawn new processes
+                // if the gate is already closed.
+                let _launched_processes_guard = match self.launched_processes.enter() {
+                    Ok(guard) => guard,
+                    Err(GateError::GateClosed) => unreachable!(
+                        "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
+                    ),
+                };
+                let proc = Arc::new(Process {
+                    process: process::WalRedoProcess::launch(
+                        self.conf,
+                        self.tenant_shard_id,
+                        pg_version,
+                    )
+                    .context("launch walredo process")?,
+                    _launched_processes_guard,
+                });
+                let duration = start.elapsed();
+                WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                info!(
+                    elapsed_ms = duration.as_millis(),
+                    pid = proc.id(),
+                    "launched walredo process"
+                );
+                self.redo_process
+                    .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
+                proc
+            }
+        };
+
+        // async closures are unstable, would support &Process
+        let result = closure(proc.clone()).await;
+
+        if result.is_err() {
+            // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
+            // Note that there may be other tasks concurrent with us that also hold `proc`.
+            // We have to deal with that here.
+            // Also read the doc comment on field `self.redo_process`.
+            //
+            // NB: there may still be other concurrent threads using `proc`.
+            // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
+            //
+            // NB: the drop impl blocks the dropping thread with a wait() system call for
+            // the child process. In some ways the blocking is actually good: if we
+            // deferred the waiting into the background / to tokio if we used `tokio::process`,
+            // it could happen that if walredo always fails immediately, we spawn processes faster
+            // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
+            // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
+            // This probably needs revisiting at some later point.
+            match self.redo_process.get() {
+                None => (),
+                Some(guard) => {
+                    match &*guard {
+                        ProcessOnceCell::ManagerShutDown => {}
+                        ProcessOnceCell::Spawned(guard_proc) => {
+                            if Arc::ptr_eq(&proc, guard_proc) {
+                                // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
+                                guard.take_and_deinit();
+                            } else {
+                                // Another task already spawned another redo process (further up in this method)
+                                // and put it into `redo_process`. Do nothing, our view of the world is behind.
+                            }
+                        }
+                    }
+                }
+            }
+            // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
+            drop(proc);
+        }
+
+        result
+    }
+
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
@@ -320,130 +430,63 @@ impl PostgresRedoManager {
        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let mut n_attempts = 0u32;
        loop {
-            let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
-                Ok(guard) => match &*guard {
-                    ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
-                    ProcessOnceCell::ManagerShutDown => {
-                        return Err(Error::Cancelled);
-                    }
-                },
-                Err(permit) => {
-                    let start = Instant::now();
-                    // acquire guard before spawning process, so that we don't spawn new processes
-                    // if the gate is already closed.
-                    let _launched_processes_guard = match self.launched_processes.enter() {
-                                Ok(guard) => guard,
-                                Err(GateError::GateClosed) => unreachable!(
-                                    "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
-                                ),
-                            };
-                    let proc = Arc::new(Process {
-                        process: process::WalRedoProcess::launch(
-                            self.conf,
-                            self.tenant_shard_id,
-                            pg_version,
-                        )
-                        .context("launch walredo process")?,
-                        _launched_processes_guard,
-                    });
-                    let duration = start.elapsed();
-                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                    info!(
-                        duration_ms = duration.as_millis(),
-                        pid = proc.id(),
-                        "launched walredo process"
-                    );
-                    self.redo_process
-                        .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
-                    proc
-                }
-            };
+            let base_img = &base_img;
+            let closure = |proc: Arc<Process>| async move {
+                let started_at = std::time::Instant::now();

-            let started_at = std::time::Instant::now();
+                // Relational WAL records are applied using wal-redo-postgres
+                let result = proc
+                    .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+                    .await
+                    .context("apply_wal_records");

-            // Relational WAL records are applied using wal-redo-postgres
-            let result = proc
-                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
-                .await
-                .context("apply_wal_records");
+                let duration = started_at.elapsed();

-            let duration = started_at.elapsed();
-
-            let len = records.len();
-            let nbytes = records.iter().fold(0, |acumulator, record| {
-                acumulator
-                    + match &record.1 {
-                        NeonWalRecord::Postgres { rec, .. } => rec.len(),
-                        _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
-                    }
-            });
-
-            WAL_REDO_TIME.observe(duration.as_secs_f64());
-            WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
-            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
-
-            debug!(
-                "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-                len,
-                nbytes,
-                duration.as_micros(),
-                lsn
-            );
-
-            // If something went wrong, don't try to reuse the process. Kill it, and
-            // next request will launch a new one.
-            if let Err(e) = result.as_ref() {
-                error!(
-                    "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
-                    records.len(),
-                    records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                    records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                    nbytes,
-                    base_img_lsn,
-                    lsn,
-                    n_attempts,
-                    e,
-                );
-                // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
-                // Note that there may be other tasks concurrent with us that also hold `proc`.
-                // We have to deal with that here.
-                // Also read the doc comment on field `self.redo_process`.
-                //
-                // NB: there may still be other concurrent threads using `proc`.
-                // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
-                //
-                // NB: the drop impl blocks the dropping thread with a wait() system call for
-                // the child process. In some ways the blocking is actually good: if we
-                // deferred the waiting into the background / to tokio if we used `tokio::process`,
-                // it could happen that if walredo always fails immediately, we spawn processes faster
-                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
-                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
-                // This probably needs revisiting at some later point.
-                match self.redo_process.get() {
-                    None => (),
-                    Some(guard) => {
-                        match &*guard {
-                            ProcessOnceCell::ManagerShutDown => {}
-                            ProcessOnceCell::Spawned(guard_proc) => {
-                                if Arc::ptr_eq(&proc, guard_proc) {
-                                    // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
-                                    guard.take_and_deinit();
-                                } else {
-                                    // Another task already spawned another redo process (further up in this method)
-                                    // and put it into `redo_process`. Do nothing, our view of the world is behind.
-                                }
-                            }
+                let len = records.len();
+                let nbytes = records.iter().fold(0, |acumulator, record| {
+                    acumulator
+                        + match &record.1 {
+                            NeonWalRecord::Postgres { rec, .. } => rec.len(),
+                            _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
                        }
-                    }
+                });
+
+                WAL_REDO_TIME.observe(duration.as_secs_f64());
+                WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+                WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
+
+                debug!(
+                    "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+                    len,
+                    nbytes,
+                    duration.as_micros(),
+                    lsn
+                );
+
+                if let Err(e) = result.as_ref() {
+                    error!(
+                        "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                        records.len(),
+                        records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                        records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                        nbytes,
+                        base_img_lsn,
+                        lsn,
+                        n_attempts,
+                        e,
+                    );
                }
-                // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
-                drop(proc);
-            } else if n_attempts != 0 {
+
+                result.map_err(Error::Other)
+            };
+            let result = self.do_with_walredo_process(pg_version, closure).await;
+
+            if result.is_ok() && n_attempts != 0 {
                info!(n_attempts, "retried walredo succeeded");
            }
            n_attempts += 1;
            if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
-                return result.map_err(Error::Other);
+                return result;
            }
        }
    }
@@ -513,6 +556,17 @@ mod tests {
    use tracing::Instrument;
    use utils::{id::TenantId, lsn::Lsn};

+    #[tokio::test]
+    async fn test_ping() {
+        let h = RedoHarness::new().unwrap();
+
+        h.manager
+            .ping(14)
+            .instrument(h.span())
+            .await
+            .expect("ping should work");
+    }
+
    #[tokio::test]
    async fn short_v14_redo() {
        let expected = std::fs::read("test_data/short_v14_redo.page").unwrap();
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -6,6 +6,7 @@ use self::no_leak_child::NoLeakChild;
 use crate::{
    config::PageServerConf,
    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
+    page_cache::PAGE_SZ,
    span::debug_assert_current_span_has_tenant_id,
    walrecord::NeonWalRecord,
 };
@@ -237,6 +238,26 @@ impl WalRedoProcess {
        res
    }

+    /// Do a ping request-response roundtrip.
+    ///
+    /// Not used in production, but by Rust benchmarks.
+    pub(crate) async fn ping(&self, timeout: Duration) -> anyhow::Result<()> {
+        let mut writebuf: Vec<u8> = Vec::with_capacity(4);
+        protocol::build_ping_msg(&mut writebuf);
+        let Ok(res) = tokio::time::timeout(timeout, self.apply_wal_records0(&writebuf)).await
+        else {
+            anyhow::bail!("WAL redo ping timed out");
+        };
+        let response = res?;
+        if response.len() != PAGE_SZ {
+            anyhow::bail!(
+                "WAL redo ping response should respond with page-sized response: {}",
+                response.len()
+            );
+        }
+        Ok(())
+    }
+
    /// # Cancel-Safety
    ///
    /// When not polled to completion (e.g. because in `tokio::select!` another
--- a/pageserver/src/walredo/process/protocol.rs
+++ b/pageserver/src/walredo/process/protocol.rs
@@ -55,3 +55,8 @@ pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
    tag.ser_into(buf)
        .expect("serialize BufferTag should always succeed");
 }
+
+pub(crate) fn build_ping_msg(buf: &mut Vec<u8>) {
+    buf.put_u8(b'H');
+    buf.put_u32(4);
+}