dirty

utils: use SmallVec in VecMap
update ingest_bench
2026-05-18 21:50:37 +00:00 · 2024-08-05 18:39:26 +00:00 · 2024-08-05 18:22:41 +00:00 · 2024-08-05 17:57:41 +00:00 · 2024-08-05 17:49:02 +00:00 · 2024-08-05 17:49:02 +00:00
52 changed files with 1619 additions and 1712 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -8,8 +8,6 @@ self-hosted-runner:
    - small-arm64
    - us-east-2
 config-variables:
-  - BENCHMARK_PROJECT_ID_PUB
-  - BENCHMARK_PROJECT_ID_SUB
  - REMOTE_STORAGE_AZURE_CONTAINER
  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -147,7 +147,7 @@ jobs:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -168,7 +168,7 @@ jobs:
        path: /tmp/neon/
        prefix: latest

-    - name: Run Logical Replication benchmarks
+    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
@@ -176,15 +176,12 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 5400
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
-        BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
-        BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}

-    - name: Run Physical Replication benchmarks
+    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -66,31 +66,7 @@ jobs:
        ports:
          - 9000:9000
          - 8123:8123
-      zookeeper:
-        image: quay.io/debezium/zookeeper:2.7
-        ports:
-          - 2181:2181
-      kafka:
-        image: quay.io/debezium/kafka:2.7
-        env:
-          ZOOKEEPER_CONNECT: "zookeeper:2181"
-          KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
-          KAFKA_BROKER_ID: 1
-          KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
-          KAFKA_JMX_PORT: 9991
-        ports:
-          - 9092:9092
-      debezium:
-        image: quay.io/debezium/connect:2.7
-        env:
-          BOOTSTRAP_SERVERS: kafka:9092
-          GROUP_ID: 1
-          CONFIG_STORAGE_TOPIC: debezium-config
-          OFFSET_STORAGE_TOPIC: debezium-offset
-          STATUS_STORAGE_TOPIC: debezium-status
-          DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
-        ports:
-          - 8083:8083
+
    steps:
      - uses: actions/checkout@v4

--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -10,6 +10,10 @@ defaults:
  run:
    shell: bash -euxo pipefail {0}

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
 env:
  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6762,6 +6762,7 @@ dependencies = [
 "serde_path_to_error",
 "serde_with",
 "signal-hook",
+ "smallvec",
 "strum",
 "strum_macros",
 "thiserror",
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -933,8 +933,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
 #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
 COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
 COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
-COPY --from=rum-pg-build /rum.tar.gz /ext-src
-COPY patches/rum.patch /ext-src
+#COPY --from=rum-pg-build /rum.tar.gz /ext-src
 #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
 COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
 COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -946,7 +945,7 @@ COPY patches/pg_hintplan.patch /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
-#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
+COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
 COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
 COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
 COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
@@ -961,7 +960,6 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
    rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
    || exit 1; rm -f $f; done
 RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
-RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
 # cmake is required for the h3 test
 RUN apt-get update && apt-get install -y cmake
 RUN patch -p1 < /ext-src/pg_hintplan.patch
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
        rm -rf $TMPDIR
        # We are running tests now
-        if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
+        if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
        then
            cleanup
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,15 +1,15 @@
 #!/bin/bash
 set -x

-cd /ext-src || exit 2
+cd /ext-src
 FAILED=
-LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
+LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
 for d in ${LIST}
 do
-       [ -d "${d}" ] || continue
+       [ -d ${d} ] || continue
    psql -c "select 1" >/dev/null || break
-       USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
+       make -C ${d} installcheck || FAILED="${d} ${FAILED}"
 done
 [ -z "${FAILED}" ] && exit 0
-echo "${FAILED}"
+echo ${FAILED}
 exit 1
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -637,13 +637,6 @@ pub struct TenantInfo {
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
    pub attachment_status: TenantAttachmentStatus,
    pub generation: u32,
-
-    /// Opaque explanation if gc is being blocked.
-    ///
-    /// Only looked up for the individual tenant detail, not the listing. This is purely for
-    /// debugging, not included in openapi.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub gc_blocking: Option<String>,
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -1434,7 +1427,6 @@ mod tests {
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
-            gc_blocking: None,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
@@ -1457,7 +1449,6 @@ mod tests {
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
-            gc_blocking: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -35,6 +35,7 @@ routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
+smallvec.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,11 +1,15 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};

+use smallvec::SmallVec;
+
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum VecMapOrdering {
    Greater,
    GreaterOrEqual,
 }

+const INLINE_ELEMENTS: usize = 1;
+
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
@@ -13,7 +17,7 @@ pub enum VecMapOrdering {
 /// during `VecMap` construction.
 #[derive(Clone, Debug)]
 pub struct VecMap<K, V> {
-    data: Vec<(K, V)>,
+    data: SmallVec<[(K, V); INLINE_ELEMENTS]>,
    ordering: VecMapOrdering,
 }

@@ -37,14 +41,14 @@ pub enum VecMapError {
 impl<K: Ord, V> VecMap<K, V> {
    pub fn new(ordering: VecMapOrdering) -> Self {
        Self {
-            data: Vec::new(),
+            data: Default::default(),
            ordering,
        }
    }

    pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
        Self {
-            data: Vec::with_capacity(capacity),
+            data: SmallVec::with_capacity(capacity),
            ordering,
        }
    }
@@ -95,6 +99,10 @@ impl<K: Ord, V> VecMap<K, V> {
        Ok(delta_size)
    }

+    pub fn append_fast(&mut self, key: K, value: V) {
+        self.data.push((key, value)) // NOTE(review): pushes with no key-ordering check; presumably callers guarantee order — confirm
+    }
+
    /// Update the maximum key value pair or add a new key value pair to the map.
    /// If `key` is not respective of the `self` ordering no updates or additions
    /// will occur and `InvalidKey` error will be returned.
@@ -135,11 +143,11 @@ impl<K: Ord, V> VecMap<K, V> {

        (
            VecMap {
-                data: self.data[..split_idx].to_vec(),
+                data: SmallVec::from(&self.data[..split_idx]),
                ordering: self.ordering,
            },
            VecMap {
-                data: self.data[split_idx..].to_vec(),
+                data: SmallVec::from(&self.data[split_idx..]),
                ordering: self.ordering,
            },
        )
@@ -186,7 +194,10 @@ impl<K: Ord, V> VecMap<K, V> {
    /// Instrument an operation on the underlying [`Vec`].
    /// Will panic if the operation decreases capacity.
    /// Returns the increase in memory usage caused by the op.
-    fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
+    fn instrument_vec_op(
+        &mut self,
+        op: impl FnOnce(&mut SmallVec<[(K, V); INLINE_ELEMENTS]>),
+    ) -> usize {
        let old_cap = self.data.capacity();
        op(&mut self.data);
        let new_cap = self.data.capacity();
@@ -226,7 +237,7 @@ impl<K: Ord, V> VecMap<K, V> {

 impl<K: Ord, V> IntoIterator for VecMap<K, V> {
    type Item = (K, V);
-    type IntoIter = std::vec::IntoIter<(K, V)>;
+    type IntoIter = smallvec::IntoIter<[(K, V); INLINE_ELEMENTS]>;

    fn into_iter(self) -> Self::IntoIter {
        self.data.into_iter()
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -108,3 +108,7 @@ harness = false
 [[bench]]
 name = "bench_walredo"
 harness = false
+
+[[bench]]
+name = "bench_ingest"
+harness = false
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -0,0 +1,250 @@
+use std::{env, num::NonZeroUsize};
+
+use bytes::Bytes;
+use camino::Utf8PathBuf;
+use criterion::{criterion_group, criterion_main, Criterion};
+use pageserver::{
+    config::PageServerConf,
+    context::{DownloadBehavior, RequestContext},
+    l0_flush::{L0FlushConfig, L0FlushGlobalState},
+    page_cache,
+    repository::Value,
+    task_mgr::TaskKind,
+    tenant::storage_layer::{InMemoryLayer, SerializedBatch},
+    virtual_file::{self, api::IoEngineKind},
+};
+use pageserver_api::{key::Key, shard::TenantShardId};
+use utils::{
+    bin_ser::BeSer,
+    id::{TenantId, TimelineId},
+};
+
+// A very cheap hash for generating non-sequential keys: three xor-shift
+// steps (by 16, 13, 16 bits) interleaved with two wrapping multiplications.
+fn murmurhash32(h: u32) -> u32 {
+    let stage1 = (h ^ (h >> 16)).wrapping_mul(0x85ebca6b);
+    let stage2 = (stage1 ^ (stage1 >> 13)).wrapping_mul(0xc2b2ae35);
+
+    // Final xor-shift folds the high half back into the low half.
+    stage2 ^ (stage2 >> 16)
+}
+
+enum KeyLayout {
+    /// Unique keys in sequential order: `field6` is set to the put index.
+    Sequential,
+    /// Unique keys in pseudo-random order: `field6` is `murmurhash32` of the put index.
+    Random,
+    /// Pseudo-random keys ANDed with the given bit mask, so keys repeat (limited cardinality).
+    RandomReuse(u32),
+}
+
+enum WriteDelta {
+    Yes, // after freezing, also write the layer to disk (the written file is then removed)
+    No, // stop after freezing the in-memory layer
+}
+
+/// Runs one ingest pass against a freshly created `InMemoryLayer`.
+///
+/// Writes `put_count` image values of `put_size` bytes in batches of up to `batch_pages`, freezes
+/// the layer, and for `WriteDelta::Yes` also writes it to disk and then removes the written file.
+async fn ingest(
+    conf: &'static PageServerConf,
+    put_size: usize,
+    put_count: usize,
+    key_layout: KeyLayout,
+    write_delta: WriteDelta,
+) -> anyhow::Result<()> {
+    let mut lsn = utils::lsn::Lsn(1000);
+    let mut key = Key::from_i128(0x0);
+
+    let timeline_id = TimelineId::generate();
+    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+    let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &ctx).await?;
+
+    let value = Value::Image(Bytes::from(vec![0u8; put_size]));
+    let ctx = RequestContext::new(
+        pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
+        pageserver::context::DownloadBehavior::Download,
+    );
+
+    let batch_pages = 10000;
+    let mut batch_values = vec![];
+
+    for i in 0..put_count {
+        lsn += put_size as u64;
+        // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
+        // usually care the most about write performance when they're blasting a huge batch of data into a huge table.
+        match key_layout {
+            KeyLayout::Sequential => {
+                // Use sequential order to illustrate the experience a user is likely to have
+                // when ingesting bulk data.
+                key.field6 = i as u32;
+            }
+            KeyLayout::Random => {
+                // Use random-order keys to avoid giving a false advantage to data structures that are
+                // faster when inserting on the end.
+                key.field6 = murmurhash32(i as u32);
+            }
+            KeyLayout::RandomReuse(mask) => {
+                // Use low bits only, to limit cardinality
+                key.field6 = murmurhash32(i as u32) & mask;
+            }
+        }
+
+        batch_values.push((key, lsn, value.clone()));
+        if batch_values.len() >= batch_pages {
+            let write_batch = std::mem::take(&mut batch_values);
+            let batch = SerializedBatch::from_values(write_batch);
+            layer.put_batch(&batch, &ctx).await?;
+        }
+    }
+    // Flush the final partial batch.  All remaining values must be written (not just the last one),
+    // otherwise fewer than `put_count` values are ingested and the reported throughput is overstated.
+    if !batch_values.is_empty() {
+        let batch = SerializedBatch::from_values(batch_values);
+        layer.put_batch(&batch, &ctx).await?;
+    }
+    layer.freeze(lsn + 1).await;
+
+    if matches!(write_delta, WriteDelta::Yes) {
+        let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
+            max_concurrency: NonZeroUsize::new(1).unwrap(),
+        });
+        let (_desc, path) = layer
+            .write_to_disk(&ctx, None, l0_flush_state.inner())
+            .await?
+            .unwrap();
+        tokio::fs::remove_file(path).await?;
+    }
+
+    Ok(())
+}
+
+/// Synchronous entry point for one benchmark iteration: builds a fresh single-threaded tokio
+/// runtime, drives `ingest` to completion on it, and panics if `ingest` returned an error.
+fn ingest_main(
+    conf: &'static PageServerConf,
+    put_size: usize,
+    put_count: usize,
+    key_layout: KeyLayout,
+    write_delta: WriteDelta,
+) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
+    let outcome = rt.block_on(ingest(conf, put_size, put_count, key_layout, write_delta));
+    if let Err(e) = outcome {
+        // Debug-format the error for the panic message.
+        panic!("{e:?}");
+    }
+}
+
+/// Declare a series of benchmarks for the Pageserver's ingest write path.
+///
+/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_batch, and ends either
+/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
+///
+/// Genuine disk I/O is used, so expect results to differ depending on storage.  However, when running on
+/// a fast disk, CPU is the bottleneck at time of writing.
+fn criterion_benchmark(c: &mut Criterion) {
+    let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
+    let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap(); // created under the current working directory
+    eprintln!("Data directory: {}", temp_dir.path());
+
+    let conf: &'static PageServerConf = Box::leak(Box::new(
+        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
+    ));
+    virtual_file::init(16384, IoEngineKind::TokioEpollUring);
+    page_cache::init(conf.page_cache_size);
+
+    {
+        let mut group = c.benchmark_group("ingest-small-values");
+        let put_size = 100usize;
+        let put_count = 128 * 1024 * 1024 / put_size; // number of puts that total (about) 128 MiB
+        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
+        group.sample_size(10);
+        group.bench_function("ingest 128MB/100b seq", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::Sequential,
+                    WriteDelta::Yes,
+                )
+            })
+        });
+        group.bench_function("ingest 128MB/100b rand", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::Random,
+                    WriteDelta::Yes,
+                )
+            })
+        });
+        group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::RandomReuse(0x3ff), // 10-bit mask: at most 1024 distinct key values
+                    WriteDelta::Yes,
+                )
+            })
+        });
+        group.bench_function("ingest 128MB/100b seq, no delta", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::Sequential,
+                    WriteDelta::No,
+                )
+            })
+        });
+    }
+
+    {
+        let mut group = c.benchmark_group("ingest-big-values");
+        let put_size = 8192usize;
+        let put_count = 128 * 1024 * 1024 / put_size;
+        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
+        group.sample_size(10);
+        group.bench_function("ingest 128MB/8k seq", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::Sequential,
+                    WriteDelta::Yes,
+                )
+            })
+        });
+        group.bench_function("ingest 128MB/8k seq, no delta", |b| {
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    put_size,
+                    put_count,
+                    KeyLayout::Sequential,
+                    WriteDelta::No,
+                )
+            })
+        });
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -308,45 +308,6 @@ paths:
            application/json:
              schema:
                type: string
-
-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Persistently add a gc blocking at the tenant level because of this timeline
-      responses:
-        "200":
-          description: OK
-
-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Persistently remove a tenant level gc blocking for this timeline
-      responses:
-        "200":
-          description: OK
-
  /v1/tenant/{tenant_shard_id}/location_config:
    parameters:
      - name: tenant_shard_id
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -935,7 +935,6 @@ async fn tenant_list_handler(
            generation: (*gen)
                .into()
                .expect("Tenants are always attached with a generation"),
-            gc_blocking: None,
        })
        .collect::<Vec<TenantInfo>>();

@@ -987,7 +986,6 @@ async fn tenant_status(
                    .generation()
                    .into()
                    .expect("Tenants are always attached with a generation"),
-                gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
            },
            walredo: tenant.wal_redo_manager_status(),
            timelines: tenant.list_timeline_ids(),
@@ -1228,72 +1226,6 @@ async fn evict_timeline_layer_handler(
    }
 }

-async fn timeline_gc_blocking_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    block_or_unblock_gc(request, true).await
-}
-
-async fn timeline_gc_unblocking_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    block_or_unblock_gc(request, false).await
-}
-
-/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
-///
-/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
-async fn block_or_unblock_gc(
-    request: Request<Body>,
-    block: bool,
-) -> Result<Response<Body>, ApiError> {
-    use crate::tenant::{
-        remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
-    };
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let state = get_state(&request);
-
-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-    let timeline = tenant.get_timeline(timeline_id, true)?;
-
-    let fut = async {
-        if block {
-            timeline.block_gc(&tenant).await.map(|_| ())
-        } else {
-            timeline.unblock_gc(&tenant).await
-        }
-    };
-
-    let span = tracing::info_span!(
-        "block_or_unblock_gc",
-        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
-        timeline_id = %timeline_id,
-        block = block,
-    );
-
-    let res = fut.instrument(span).await;
-
-    res.map_err(|e| {
-        if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
-            ApiError::ShuttingDown
-        } else {
-            ApiError::InternalServerError(e)
-        }
-    })?;
-
-    json_response(StatusCode::OK, ())
-}
-
 /// Get tenant_size SVG graph along with the JSON data.
 fn synthetic_size_html_response(
    inputs: ModelInputs,
@@ -2972,14 +2904,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, evict_timeline_layer_handler),
        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
-            |r| api_handler(r, timeline_gc_blocking_handler),
-        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
-            |r| api_handler(r, timeline_gc_unblocking_handler),
-        )
        .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
            api_handler(r, secondary_upload_handler)
        })
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -24,7 +24,7 @@ impl Default for L0FlushConfig {
 #[derive(Clone)]
 pub struct L0FlushGlobalState(Arc<Inner>);

-pub(crate) enum Inner {
+pub enum Inner {
    PageCached,
    Direct { semaphore: tokio::sync::Semaphore },
 }
@@ -40,7 +40,7 @@ impl L0FlushGlobalState {
        }
    }

-    pub(crate) fn inner(&self) -> &Arc<Inner> {
+    pub fn inner(&self) -> &Arc<Inner> {
        &self.0
    }
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,7 +15,6 @@ use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
@@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
-use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
@@ -174,6 +172,7 @@ impl Timeline {
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
            pending_directory_entries: Vec::new(),
+            pending_bytes: 0,
            lsn,
        }
    }
@@ -1058,14 +1057,26 @@ pub struct DatadirModification<'a> {
    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,
+
+    /// An **approximation** of how large our EphemeralFile write will be when committed.
+    pending_bytes: usize,
 }

 impl<'a> DatadirModification<'a> {
+    // When a DatadirModification is committed, we do a monolithic serialization of all its contents.  WAL records can
+    // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
+    // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
+    pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
+
    /// Get the current lsn
    pub(crate) fn get_lsn(&self) -> Lsn {
        self.lsn
    }

+    pub(crate) fn approx_pending_bytes(&self) -> usize {
+        self.pending_bytes // running estimate of buffered payload; not an exact serialized size
+    }
+
    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
        ensure!(
@@ -1793,11 +1804,12 @@ impl<'a> DatadirModification<'a> {
        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
        for (key, values) in self.pending_updates.drain() {
+            let mut write_batch = Vec::new();
            for (lsn, value) in values {
                if key.is_rel_block_key() || key.is_slru_block_key() {
                    // This bails out on first error without modifying pending_updates.
                    // That's Ok, cf this function's doc comment.
-                    writer.put(key, lsn, &value, ctx).await?;
+                    write_batch.push((key, lsn, value));
                } else {
                    retained_pending_updates
                        .entry(key)
@@ -1805,9 +1817,11 @@ impl<'a> DatadirModification<'a> {
                        .push((lsn, value));
                }
            }
+            writer.put_batch(write_batch, ctx).await?;
        }

        self.pending_updates = retained_pending_updates;
+        self.pending_bytes = 0;

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1833,17 +1847,20 @@ impl<'a> DatadirModification<'a> {
        self.pending_nblocks = 0;

        if !self.pending_updates.is_empty() {
-            // The put_batch call below expects expects the inputs to be sorted by Lsn,
-            // so we do that first.
-            let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
-                self.pending_updates
-                    .drain()
-                    .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
-                    .kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
-                VecMapOrdering::GreaterOrEqual,
-            );
+            // Ordering: the items in this batch do not need to be in any global order, but values for
+            // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
+            // this to do efficient updates to its index.
+            let batch: Vec<(Key, Lsn, Value)> = self
+                .pending_updates
+                .drain()
+                .flat_map(|(key, values)| {
+                    values
+                        .into_iter()
+                        .map(move |(lsn, value)| (key, lsn, value))
+                })
+                .collect::<Vec<_>>();

-            writer.put_batch(lsn_ordered_batch, ctx).await?;
+            writer.put_batch(batch, ctx).await?;
        }

        if !self.pending_deletions.is_empty() {
@@ -1868,6 +1885,8 @@ impl<'a> DatadirModification<'a> {
            writer.update_directory_entries_count(kind, count as u64);
        }

+        self.pending_bytes = 0;
+
        Ok(())
    }

@@ -1918,6 +1937,10 @@ impl<'a> DatadirModification<'a> {
                return;
            }
        }
+        self.pending_bytes += match &val {
+            Value::Image(inner) => inner.len(),
+            Value::WalRecord(_) => 100, // Rough approximation of typical serialized WalRecord size.
+        };
        values.push((self.lsn, val));
    }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -148,7 +148,6 @@ pub(crate) mod timeline;

 pub mod size;

-mod gc_block;
 pub(crate) mod throttle;

 pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -304,12 +303,6 @@ pub struct Tenant {
    /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
    ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,

-    /// `index_part.json` based gc blocking reason tracking.
-    ///
-    /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
-    /// proceeding.
-    pub(crate) gc_block: gc_block::GcBlock,
-
    l0_flush_global_state: L0FlushGlobalState,
 }

@@ -1043,8 +1036,6 @@ impl Tenant {
            }
        }

-        let mut gc_blocks = HashMap::new();
-
        // For every timeline, download the metadata file, scan the local directory,
        // and build a layer map that contains an entry for each remote and local
        // layer file.
@@ -1054,16 +1045,6 @@ impl Tenant {
                .remove(&timeline_id)
                .expect("just put it in above");

-            if let Some(blocking) = index_part.gc_blocking.as_ref() {
-                // could just filter these away, but it helps while testing
-                anyhow::ensure!(
-                    !blocking.reasons.is_empty(),
-                    "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
-                );
-                let prev = gc_blocks.insert(timeline_id, blocking.reasons);
-                assert!(prev.is_none());
-            }
-
            // TODO again handle early failure
            self.load_remote_timeline(
                timeline_id,
@@ -1108,8 +1089,6 @@ impl Tenant {
        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

-        self.gc_block.set_scanned(gc_blocks);
-
        fail::fail_point!("attach-before-activate", |_| {
            anyhow::bail!("attach-before-activate");
        });
@@ -1700,14 +1679,6 @@ impl Tenant {
            }
        }

-        let _guard = match self.gc_block.start().await {
-            Ok(guard) => guard,
-            Err(reasons) => {
-                info!("Skipping GC: {reasons}");
-                return Ok(GcResult::default());
-            }
-        };
-
        self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
            .await
    }
@@ -2720,7 +2691,6 @@ impl Tenant {
            )),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
            ongoing_timeline_detach: std::sync::Mutex::default(),
-            gc_block: Default::default(),
            l0_flush_global_state,
        }
    }
@@ -4122,7 +4092,7 @@ pub(crate) mod harness {

 #[cfg(test)]
 mod tests {
-    use std::collections::{BTreeMap, BTreeSet};
+    use std::collections::BTreeMap;

    use super::*;
    use crate::keyspace::KeySpaceAccum;
@@ -4797,7 +4767,7 @@ mod tests {
        lsn: Lsn,
        repeat: usize,
        key_count: usize,
-    ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
+    ) -> anyhow::Result<()> {
        let compact = true;
        bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
    }
@@ -4810,9 +4780,7 @@ mod tests {
        repeat: usize,
        key_count: usize,
        compact: bool,
-    ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
-        let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
-
+    ) -> anyhow::Result<()> {
        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let mut blknum = 0;

@@ -4833,7 +4801,6 @@ mod tests {
                        ctx,
                    )
                    .await?;
-                inserted.entry(test_key).or_default().insert(lsn);
                writer.finish_write(lsn);
                drop(writer);

@@ -4858,7 +4825,7 @@ mod tests {
            assert_eq!(res.layers_removed, 0, "this never removes anything");
        }

-        Ok(inserted)
+        Ok(())
    }

    //
@@ -4905,7 +4872,7 @@ mod tests {
            .await?;

        let lsn = Lsn(0x10);
-        let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
+        bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;

        let guard = tline.layers.read().await;
        guard.layer_map().dump(true, &ctx).await?;
@@ -4966,39 +4933,9 @@ mod tests {
                    &ctx,
                )
                .await;
-
-            let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
-            let mut expect_missing = false;
-            let mut key = read.start().unwrap();
-            while key != read.end().unwrap() {
-                if let Some(lsns) = inserted.get(&key) {
-                    let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
-                    match expected_lsn {
-                        Some(lsn) => {
-                            expected_lsns.insert(key, *lsn);
-                        }
-                        None => {
-                            expect_missing = true;
-                            break;
-                        }
-                    }
-                } else {
-                    expect_missing = true;
-                    break;
-                }
-
-                key = key.next();
-            }
-
-            if expect_missing {
-                assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
-            } else {
-                for (key, image) in vectored_res? {
-                    let expected_lsn = expected_lsns.get(&key).expect("determined above");
-                    let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
-                    assert_eq!(image?, expected_image);
-                }
-            }
+            tline
+                .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
+                .await;
        }

        Ok(())
@@ -5048,6 +4985,10 @@ mod tests {
            )
            .await;

+        child_timeline
+            .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
+            .await;
+
        let images = vectored_res?;
        assert!(images.is_empty());
        Ok(())
@@ -6958,10 +6899,7 @@ mod tests {
        }

        let cancel = CancellationToken::new();
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        for (idx, expected) in expected_result.iter().enumerate() {
            assert_eq!(
@@ -7055,10 +6993,7 @@ mod tests {
            guard.cutoffs.time = Lsn(0x40);
            guard.cutoffs.space = Lsn(0x40);
        }
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        Ok(())
    }
@@ -7392,10 +7327,7 @@ mod tests {
        }

        let cancel = CancellationToken::new();
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        for idx in 0..10 {
            assert_eq!(
@@ -7421,10 +7353,7 @@ mod tests {
            guard.cutoffs.time = Lsn(0x40);
            guard.cutoffs.space = Lsn(0x40);
        }
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        Ok(())
    }
@@ -7969,28 +7898,11 @@ mod tests {
        verify_result().await;

        let cancel = CancellationToken::new();
-        let mut dryrun_flags = EnumSet::new();
-        dryrun_flags.insert(CompactFlags::DryRun);
-
-        tline
-            .compact_with_gc(&cancel, dryrun_flags, &ctx)
-            .await
-            .unwrap();
-        // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
-        // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
-        verify_result().await;
-
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
        verify_result().await;

        // compact again
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
        verify_result().await;

        // increase GC horizon and compact again
@@ -8000,17 +7912,11 @@ mod tests {
            guard.cutoffs.time = Lsn(0x38);
            guard.cutoffs.space = Lsn(0x38);
        }
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
        verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result

        // not increasing the GC horizon and compact again
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
        verify_result().await;

        Ok(())
@@ -8191,10 +8097,7 @@ mod tests {
        verify_result().await;

        let cancel = CancellationToken::new();
-        branch_tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        verify_result().await;

--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -79,6 +79,8 @@ impl EphemeralFile {
        self.rw.read_blk(blknum, ctx).await
    }

+    #[cfg(test)]
+    // This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
    pub(crate) async fn write_blob(
        &mut self,
        srcbuf: &[u8],
@@ -86,17 +88,28 @@ impl EphemeralFile {
    ) -> Result<u64, io::Error> {
        let pos = self.rw.bytes_written();

-        // Write the length field
-        if srcbuf.len() < 0x80 {
-            // short one-byte length header
-            let len_buf = [srcbuf.len() as u8];
+        let mut len_bytes = std::io::Cursor::new(Vec::new());
+        crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
+            srcbuf.len(),
+            &mut len_bytes,
+        );
+        let len_bytes = len_bytes.into_inner();

-            self.rw.write_all_borrowed(&len_buf, ctx).await?;
-        } else {
-            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
-            len_buf[0] |= 0x80;
-            self.rw.write_all_borrowed(&len_buf, ctx).await?;
-        }
+        // Write the length field
+        self.rw.write_all_borrowed(&len_bytes, ctx).await?;
+
+        // Write the payload
+        self.rw.write_all_borrowed(srcbuf, ctx).await?;
+
+        Ok(pos)
+    }
+
+    pub(crate) async fn write_raw(
+        &mut self,
+        srcbuf: &[u8],
+        ctx: &RequestContext,
+    ) -> Result<u64, io::Error> {
+        let pos = self.rw.bytes_written();

        // Write the payload
        self.rw.write_all_borrowed(srcbuf, ctx).await?;
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,213 +0,0 @@
-use std::collections::HashMap;
-
-use utils::id::TimelineId;
-
-use super::remote_timeline_client::index::GcBlockingReason;
-
-type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
-
-#[derive(Default)]
-pub(crate) struct GcBlock {
-    /// The timelines which have current reasons to block gc.
-    ///
-    /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
-    /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
-    reasons: std::sync::Mutex<Storage>,
-    blocking: tokio::sync::Mutex<()>,
-}
-
-impl GcBlock {
-    /// Start another gc iteration.
-    ///
-    /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
-    /// it's ending, or if not currently possible, a value describing the reasons why not.
-    ///
-    /// Cancellation safe.
-    pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
-        let reasons = {
-            let g = self.reasons.lock().unwrap();
-
-            // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
-            // tests, we use everything. we should warn if the gc has been consecutively blocked
-            // for more than 1h (within single tenant session?).
-            BlockingReasons::clean_and_summarize(g)
-        };
-
-        if let Some(reasons) = reasons {
-            Err(reasons)
-        } else {
-            Ok(Guard {
-                _inner: self.blocking.lock().await,
-            })
-        }
-    }
-
-    pub(crate) fn summary(&self) -> Option<BlockingReasons> {
-        let g = self.reasons.lock().unwrap();
-
-        BlockingReasons::summarize(&g)
-    }
-
-    /// Start blocking gc for this one timeline for the given reason.
-    ///
-    /// This is not a guard based API but instead it mimics set API. The returned future will not
-    /// resolve until an existing gc round has completed.
-    ///
-    /// Returns true if this block was new, false if gc was already blocked for this reason.
-    ///
-    /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will
-    /// keep the gc blocking reason.
-    pub(crate) async fn insert(
-        &self,
-        timeline: &super::Timeline,
-        reason: GcBlockingReason,
-    ) -> anyhow::Result<bool> {
-        let (added, uploaded) = {
-            let mut g = self.reasons.lock().unwrap();
-            let set = g.entry(timeline.timeline_id).or_default();
-            let added = set.insert(reason);
-
-            // LOCK ORDER: intentionally hold the lock, see self.reasons.
-            let uploaded = timeline
-                .remote_client
-                .schedule_insert_gc_block_reason(reason)?;
-
-            (added, uploaded)
-        };
-
-        uploaded.await?;
-
-        // ensure that any ongoing gc iteration has completed
-        drop(self.blocking.lock().await);
-
-        Ok(added)
-    }
-
-    /// Remove blocking gc for this one timeline and the given reason.
-    pub(crate) async fn remove(
-        &self,
-        timeline: &super::Timeline,
-        reason: GcBlockingReason,
-    ) -> anyhow::Result<()> {
-        use std::collections::hash_map::Entry;
-
-        super::span::debug_assert_current_span_has_tenant_and_timeline_id();
-
-        let (remaining_blocks, uploaded) = {
-            let mut g = self.reasons.lock().unwrap();
-            match g.entry(timeline.timeline_id) {
-                Entry::Occupied(mut oe) => {
-                    let set = oe.get_mut();
-                    set.remove(reason);
-                    if set.is_empty() {
-                        oe.remove();
-                    }
-                }
-                Entry::Vacant(_) => {
-                    // we must still do the index_part.json update regardless, in case we had earlier
-                    // been cancelled
-                }
-            }
-
-            let remaining_blocks = g.len();
-
-            // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
-            let uploaded = timeline
-                .remote_client
-                .schedule_remove_gc_block_reason(reason)?;
-
-            (remaining_blocks, uploaded)
-        };
-        uploaded.await?;
-
-        // no need to synchronize with gc iteration again
-
-        if remaining_blocks > 0 {
-            tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
-        } else {
-            tracing::info!("gc is now unblocked for the tenant");
-        }
-
-        Ok(())
-    }
-
-    pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
-        let unblocked = {
-            let mut g = self.reasons.lock().unwrap();
-            if g.is_empty() {
-                return;
-            }
-
-            g.remove(&timeline.timeline_id);
-
-            BlockingReasons::clean_and_summarize(g).is_none()
-        };
-
-        if unblocked {
-            tracing::info!("gc is now unblocked following deletion");
-        }
-    }
-
-    /// Initialize with the non-deleted timelines of this tenant.
-    pub(crate) fn set_scanned(&self, scanned: Storage) {
-        let mut g = self.reasons.lock().unwrap();
-        assert!(g.is_empty());
-        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
-
-        if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
-            tracing::info!(summary=?reasons, "initialized with gc blocked");
-        }
-    }
-}
-
-pub(super) struct Guard<'a> {
-    _inner: tokio::sync::MutexGuard<'a, ()>,
-}
-
-#[derive(Debug)]
-pub(crate) struct BlockingReasons {
-    timelines: usize,
-    reasons: enumset::EnumSet<GcBlockingReason>,
-}
-
-impl std::fmt::Display for BlockingReasons {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{} timelines block for {:?}",
-            self.timelines, self.reasons
-        )
-    }
-}
-
-impl BlockingReasons {
-    fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        let mut reasons = enumset::EnumSet::empty();
-        g.retain(|_key, value| {
-            reasons = reasons.union(*value);
-            !value.is_empty()
-        });
-        if !g.is_empty() {
-            Some(BlockingReasons {
-                timelines: g.len(),
-                reasons,
-            })
-        } else {
-            None
-        }
-    }
-
-    fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        if g.is_empty() {
-            None
-        } else {
-            let reasons = g
-                .values()
-                .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
-            Some(BlockingReasons {
-                timelines: g.len(),
-                reasons,
-            })
-        }
-    }
-}
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -800,123 +800,6 @@ impl RemoteTimelineClient {
            .context("wait completion")
    }

-    /// Adds a gc blocking reason for this timeline if one does not exist already.
-    ///
-    /// A retryable step of timeline detach ancestor.
-    ///
-    /// Returns a future which waits until the completion of the upload.
-    pub(crate) fn schedule_insert_gc_block_reason(
-        self: &Arc<Self>,
-        reason: index::GcBlockingReason,
-    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
-    {
-        let maybe_barrier = {
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard.initialized_mut()?;
-
-            if let index::GcBlockingReason::DetachAncestor = reason {
-                if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
-                    drop(guard);
-                    panic!("cannot start detach ancestor if there is nothing to detach from");
-                }
-            }
-
-            let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
-
-            let current = upload_queue.dirty.gc_blocking.as_ref();
-            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
-
-            match (current, uploaded) {
-                (x, y) if wanted(x) && wanted(y) => None,
-                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
-                // Usual case: !wanted(x) && !wanted(y)
-                //
-                // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
-                // turn on and off some reason.
-                (x, y) => {
-                    if !wanted(x) && wanted(y) {
-                        // this could be avoided by having external in-memory synchronization, like
-                        // timeline detach ancestor
-                        warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
-                    }
-
-                    // at this point, the metadata must always show that there is a parent
-                    upload_queue.dirty.gc_blocking = current
-                        .map(|x| x.with_reason(reason))
-                        .or_else(|| Some(index::GcBlocking::started_now_for(reason)));
-                    self.schedule_index_upload(upload_queue)?;
-                    Some(self.schedule_barrier0(upload_queue))
-                }
-            }
-        };
-
-        Ok(async move {
-            if let Some(barrier) = maybe_barrier {
-                Self::wait_completion0(barrier).await?;
-            }
-            Ok(())
-        })
-    }
-
-    /// Removes a gc blocking reason for this timeline if one exists.
-    ///
-    /// A retryable step of timeline detach ancestor.
-    ///
-    /// Returns a future which waits until the completion of the upload.
-    pub(crate) fn schedule_remove_gc_block_reason(
-        self: &Arc<Self>,
-        reason: index::GcBlockingReason,
-    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
-    {
-        let maybe_barrier = {
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard.initialized_mut()?;
-
-            if let index::GcBlockingReason::DetachAncestor = reason {
-                if !upload_queue
-                    .clean
-                    .0
-                    .lineage
-                    .is_detached_from_original_ancestor()
-                {
-                    drop(guard);
-                    panic!("cannot complete timeline_ancestor_detach while not detached");
-                }
-            }
-
-            let wanted = |x: Option<&index::GcBlocking>| {
-                x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
-            };
-
-            let current = upload_queue.dirty.gc_blocking.as_ref();
-            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
-
-            match (current, uploaded) {
-                (x, y) if wanted(x) && wanted(y) => None,
-                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
-                (x, y) => {
-                    if !wanted(x) && wanted(y) {
-                        warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
-                    }
-
-                    upload_queue.dirty.gc_blocking =
-                        current.as_ref().and_then(|x| x.without_reason(reason));
-                    assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
-                    // FIXME: bogus ?
-                    self.schedule_index_upload(upload_queue)?;
-                    Some(self.schedule_barrier0(upload_queue))
-                }
-            }
-        };
-
-        Ok(async move {
-            if let Some(barrier) = maybe_barrier {
-                Self::wait_completion0(barrier).await?;
-            }
-            Ok(())
-        })
-    }
-
    /// Launch an upload operation in the background; the file is added to be included in next
    /// `index_part.json` upload.
    pub(crate) fn schedule_layer_file_upload(
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -60,9 +60,6 @@ pub struct IndexPart {
    #[serde(default)]
    pub(crate) lineage: Lineage,

-    #[serde(skip_serializing_if = "Option::is_none", default)]
-    pub(crate) gc_blocking: Option<GcBlocking>,
-
    /// Describes the kind of aux files stored in the timeline.
    ///
    /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
@@ -88,11 +85,10 @@ impl IndexPart {
    /// - 6: last_aux_file_policy is added.
    /// - 7: metadata_bytes is no longer written, but still read
    /// - 8: added `archived_at`
-    /// - 9: +gc_blocking
-    const LATEST_VERSION: usize = 9;
+    const LATEST_VERSION: usize = 8;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];

    pub const FILE_NAME: &'static str = "index_part.json";

@@ -105,7 +101,6 @@ impl IndexPart {
            deleted_at: None,
            archived_at: None,
            lineage: Default::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        }
    }
@@ -256,64 +251,6 @@ impl Lineage {
    }
 }

-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub(crate) struct GcBlocking {
-    pub(crate) started_at: NaiveDateTime,
-    pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
-}
-
-#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
-#[enumset(serialize_repr = "list")]
-pub(crate) enum GcBlockingReason {
-    Manual,
-    DetachAncestor,
-}
-
-impl GcBlocking {
-    pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
-        GcBlocking {
-            started_at: chrono::Utc::now().naive_utc(),
-            reasons: enumset::EnumSet::only(reason),
-        }
-    }
-
-    /// Returns true if the given reason is one of the reasons why the gc is blocked.
-    pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
-        self.reasons.contains(reason)
-    }
-
-    /// Returns a version of self with the given reason.
-    pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
-        assert!(!self.blocked_by(reason));
-        let mut reasons = self.reasons;
-        reasons.insert(reason);
-
-        Self {
-            started_at: self.started_at,
-            reasons,
-        }
-    }
-
-    /// Returns a version of self without the given reason. Assumption is that if
-    /// there are no more reasons, we can unblock the gc by returning `None`.
-    pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
-        assert!(self.blocked_by(reason));
-
-        if self.reasons.len() == 1 {
-            None
-        } else {
-            let mut reasons = self.reasons;
-            assert!(reasons.remove(reason));
-            assert!(!reasons.is_empty());
-
-            Some(Self {
-                started_at: self.started_at,
-                reasons,
-            })
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -355,7 +292,6 @@ mod tests {
            deleted_at: None,
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -399,7 +335,6 @@ mod tests {
            deleted_at: None,
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -444,7 +379,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -492,7 +426,6 @@ mod tests {
            deleted_at: None,
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -535,7 +468,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -581,7 +513,6 @@ mod tests {
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
            },
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -632,7 +563,6 @@ mod tests {
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
            },
-            gc_blocking: None,
            last_aux_file_policy: Some(AuxFilePolicy::V2),
        };

@@ -688,7 +618,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
            lineage: Default::default(),
-            gc_blocking: None,
            last_aux_file_policy: Default::default(),
        };

@@ -745,7 +674,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
            lineage: Default::default(),
-            gc_blocking: None,
            last_aux_file_policy: Default::default(),
        };

@@ -753,68 +681,6 @@ mod tests {
        assert_eq!(part, expected);
    }

-    #[test]
-    fn v9_indexpart_is_parsed() {
-        let example = r#"{
-            "version": 9,
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata": {
-                "disk_consistent_lsn": "0/16960E8",
-                "prev_record_lsn": "0/1696070",
-                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
-                "ancestor_lsn": "0/0",
-                "latest_gc_cutoff_lsn": "0/1696070",
-                "initdb_lsn": "0/1696070",
-                "pg_version": 14
-            },
-            "gc_blocking": {
-                "started_at": "2024-07-19T09:00:00.123",
-                "reasons": ["DetachAncestor"]
-            }
-        }"#;
-
-        let expected = IndexPart {
-            version: 9,
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
-                    file_size: 25600000,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
-                    file_size: 9007199254741001,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::new(
-                Lsn::from_str("0/16960E8").unwrap(),
-                Some(Lsn::from_str("0/1696070").unwrap()),
-                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
-                Lsn::INVALID,
-                Lsn::from_str("0/1696070").unwrap(),
-                Lsn::from_str("0/1696070").unwrap(),
-                14,
-            ).with_recalculated_checksum().unwrap(),
-            deleted_at: None,
-            lineage: Default::default(),
-            gc_blocking: Some(GcBlocking {
-                started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
-                reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
-            }),
-            last_aux_file_policy: Default::default(),
-            archived_at: None,
-        };
-
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
-        assert_eq!(part, expected);
-    }
-
    fn parse_naive_datetime(s: &str) -> NaiveDateTime {
        chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
    }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -29,6 +29,7 @@ use utils::lsn::Lsn;
 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
+pub use inmemory_layer::SerializedBatch;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};

@@ -435,6 +436,21 @@ impl ReadableLayer {
    }
 }

+/// Return value from [`Layer::get_value_reconstruct_data`]
+///
+/// All variants are plain markers: none of them carries an LSN or any other
+/// payload, so a caller that needs to descend further must work out where to
+/// look next on its own.
+#[derive(Clone, Copy, Debug)]
+pub enum ValueReconstructResult {
+    /// Got all the data needed to reconstruct the requested page
+    Complete,
+    /// This layer didn't contain all the required data, the caller should look up
+    /// the predecessor layer and collect more data from there.
+    Continue,
+
+    /// This layer didn't contain data needed to reconstruct the requested page
+    /// version. This is usually considered an error, but might be OK
+    /// in some circumstances.
+    Missing,
+}
+
 /// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
 /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
 /// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -36,13 +36,13 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
 use crate::tenant::disk_btree::{
    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
-use crate::tenant::storage_layer::Layer;
+use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadPlanner,
 };
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
@@ -73,8 +73,7 @@ use utils::{
 };

 use super::{
-    AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
-    ValuesReconstructState,
+    AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ValuesReconstructState,
 };

 ///
@@ -373,7 +372,6 @@ impl DeltaLayer {
 /// 3. Call `finish`.
 ///
 struct DeltaLayerWriterInner {
-    conf: &'static PageServerConf,
    pub path: Utf8PathBuf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
@@ -420,7 +418,6 @@ impl DeltaLayerWriterInner {
        let tree_builder = DiskBtreeBuilder::new(block_buf);

        Ok(Self {
-            conf,
            path,
            timeline_id,
            tenant_shard_id,
@@ -495,11 +492,10 @@ impl DeltaLayerWriterInner {
    async fn finish(
        self,
        key_end: Key,
-        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<ResidentLayer> {
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let temp_path = self.path.clone();
-        let result = self.finish0(key_end, timeline, ctx).await;
+        let result = self.finish0(key_end, ctx).await;
        if result.is_err() {
            tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
            if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -512,9 +508,8 @@ impl DeltaLayerWriterInner {
    async fn finish0(
        self,
        key_end: Key,
-        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<ResidentLayer> {
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -579,11 +574,9 @@ impl DeltaLayerWriterInner {
        // fsync the file
        file.sync_all().await?;

-        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+        trace!("created delta layer {}", self.path);

-        trace!("created delta layer {}", layer.local_path());
-
-        Ok(layer)
+        Ok((desc, self.path))
    }
 }

@@ -684,14 +677,9 @@ impl DeltaLayerWriter {
    pub(crate) async fn finish(
        mut self,
        key_end: Key,
-        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(key_end, timeline, ctx)
-            .await
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        self.inner.take().unwrap().finish(key_end, ctx).await
    }

    #[cfg(test)]
@@ -826,6 +814,95 @@ impl DeltaLayerInner {
        })
    }

+    /// Collect what this delta layer knows about `key` within `lsn_range`.
+    ///
+    /// The index is walked backwards from the newest LSN inside the
+    /// (end-exclusive) range. Every value found is read and stored into
+    /// `reconstruct_state`: WAL records are appended to `records`, a page
+    /// image is stored in `img`.
+    ///
+    /// Returns `Complete` as soon as an image or a will-init WAL record has
+    /// been stored, and `Continue` when the entries ran out before that.
+    /// Read and deserialization failures are returned as errors.
+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            &block_reader,
+        );
+
+        // Scan the page versions backwards, starting from the last LSN that is
+        // still inside `lsn_range` (its end is exclusive).
+        let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
+
+        // (lsn, blob position) of every index entry we decide to read, newest first.
+        let mut blob_positions: Vec<(Lsn, u64)> = Vec::new();
+
+        tree_reader
+            .visit(
+                &search_key.0,
+                VisitDirection::Backwards,
+                |raw_key, raw_value| {
+                    // Entry belongs to a different key: nothing more for us here.
+                    if raw_key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
+                        return false;
+                    }
+                    // Entry is older than the requested range.
+                    let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key);
+                    if entry_lsn < lsn_range.start {
+                        return false;
+                    }
+
+                    let blob_ref = BlobRef(raw_value);
+                    blob_positions.push((entry_lsn, blob_ref.pos()));
+
+                    // An entry flagged will-init is the last one we need.
+                    !blob_ref.will_init()
+                },
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+                    .build(),
+            )
+            .await?;
+
+        let value_ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::DeltaLayerValue)
+            .build();
+
+        // Read the collected blobs in the order they were found (newest first),
+        // reusing one scratch buffer for all of them.
+        let cursor = block_reader.block_cursor();
+        let mut scratch = Vec::new();
+        for (entry_lsn, pos) in blob_positions {
+            cursor
+                .read_blob_into_buf(pos, &mut scratch, &value_ctx)
+                .await
+                .with_context(|| {
+                    format!("Failed to read blob from virtual file {}", self.file.path)
+                })?;
+            let value = Value::des(&scratch).with_context(|| {
+                format!(
+                    "Failed to deserialize file blob from virtual file {}",
+                    self.file.path
+                )
+            })?;
+
+            match value {
+                Value::Image(img) => {
+                    reconstruct_state.img = Some((entry_lsn, img));
+                    return Ok(ValueReconstructResult::Complete);
+                }
+                Value::WalRecord(rec) => {
+                    let initializes_page = rec.will_init();
+                    reconstruct_state.records.push((entry_lsn, rec));
+                    if initializes_page {
+                        // This WAL record initializes the page, so no need to go further back
+                        return Ok(ValueReconstructResult::Complete);
+                    }
+                }
+            }
+        }
+
+        // Neither an image nor an initializing record was found: an older
+        // page image is still needed, let the caller know.
+        Ok(ValueReconstructResult::Continue)
+    }
+
+
    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    //
@@ -1598,8 +1675,9 @@ pub(crate) mod test {
    use super::*;
    use crate::repository::Value;
    use crate::tenant::harness::TIMELINE_ID;
+    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
-    use crate::tenant::Tenant;
+    use crate::tenant::{Tenant, Timeline};
    use crate::{
        context::DownloadBehavior,
        task_mgr::TaskKind,
@@ -1893,9 +1971,8 @@ pub(crate) mod test {
            res?;
        }

-        let resident = writer
-            .finish(entries_meta.key_range.end, &timeline, &ctx)
-            .await?;
+        let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?;
+        let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?;

        let inner = resident.get_as_delta(&ctx).await?;

@@ -2084,7 +2161,8 @@ pub(crate) mod test {
                .await
                .unwrap();

-            let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
+            let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap();
+            let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();

            copied_layer.get_as_delta(ctx).await.unwrap();

@@ -2212,7 +2290,9 @@ pub(crate) mod test {
        for (key, lsn, value) in deltas {
            writer.put_value(key, lsn, value, ctx).await?;
        }
-        let delta_layer = writer.finish(key_end, tline, ctx).await?;
+
+        let (desc, path) = writer.finish(key_end, ctx).await?;
+        let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;

        Ok::<_, anyhow::Error>(delta_layer)
    }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -32,7 +32,9 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{
    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
-use crate::tenant::storage_layer::LayerAccessStats;
+use crate::tenant::storage_layer::{
+    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
+};
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -427,6 +429,46 @@ impl ImageLayerInner {
        })
    }

+    /// Look up the image of `key` stored in this image layer.
+    ///
+    /// When the key is present in the index, its blob is read and stored in
+    /// `reconstruct_state.img` tagged with this layer's LSN, and `Complete`
+    /// is returned. When the key is not in the index, `Missing` is returned
+    /// and `reconstruct_state` is left untouched.
+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
+
+        // The index is keyed by the serialized form of the key.
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf);
+
+        let lookup = tree_reader
+            .get(
+                &keybuf,
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::ImageLayerBtreeNode)
+                    .build(),
+            )
+            .await?;
+
+        let offset = match lookup {
+            Some(offset) => offset,
+            None => return Ok(ValueReconstructResult::Missing),
+        };
+
+        let blob = block_reader
+            .block_cursor()
+            .read_blob(
+                offset,
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::ImageLayerValue)
+                    .build(),
+            )
+            .await
+            .with_context(|| format!("failed to read value from offset {}", offset))?;
+
+        reconstruct_state.img = Some((self.lsn, Bytes::from(blob)));
+        Ok(ValueReconstructResult::Complete)
+    }
+
+
    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    pub(super) async fn get_values_reconstruct_data(
@@ -711,10 +753,6 @@ struct ImageLayerWriterInner {
 }

 impl ImageLayerWriterInner {
-    fn size(&self) -> u64 {
-        self.tree.borrow_writer().size() + self.blob_writer.size()
-    }
-
    ///
    /// Start building a new image layer.
    ///
@@ -1006,10 +1044,6 @@ impl ImageLayerWriter {
            .finish(timeline, ctx, Some(end_key))
            .await
    }
-
-    pub(crate) fn size(&self) -> u64 {
-        self.inner.as_ref().unwrap().size()
-    }
 }

 impl Drop for ImageLayerWriter {
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,10 +10,12 @@ use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value};
 use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
+use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::PageReconstructError;
 use crate::{l0_flush, page_cache, walrecord};
-use anyhow::{anyhow, Result};
+use anyhow::{anyhow, ensure, Result};
+use camino::Utf8PathBuf;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
@@ -30,13 +32,46 @@ use std::fmt::Write;
 use std::ops::Range;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
-use tokio::sync::{RwLock, RwLockWriteGuard};
+use tokio::sync::RwLock;

-use super::{DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValuesReconstructState};
+use super::{
+    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValueReconstructState,
+    ValuesReconstructState,
+};

 #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
 pub(crate) struct InMemoryLayerFileId(page_cache::FileId);

+/// The first five fields of a [`Key`], i.e. everything except `field6`.
+///
+/// Used as the outer key of the in-memory layer index; `field6` (passed around
+/// as `blkno`) is the key of the inner map. See `key_to_prefix` and
+/// `materialize_key` for the conversions in both directions.
+///
+/// The derived ordering compares the fields in declaration order.
+#[derive(Ord, PartialOrd, Eq, PartialEq)]
+struct IndexPrefix {
+    field1: u8,
+    field2: u32,
+    field3: u32,
+    field4: u32,
+    field5: u8,
+}
+
+/// Rebuild a full [`Key`] from an index prefix plus the block number that
+/// goes into `field6`.
+fn materialize_key(prefix: &IndexPrefix, blkno: u32) -> Key {
+    let IndexPrefix {
+        field1,
+        field2,
+        field3,
+        field4,
+        field5,
+    } = *prefix;
+
+    Key {
+        field1,
+        field2,
+        field3,
+        field4,
+        field5,
+        field6: blkno,
+    }
+}
+
+/// Strip `field6` from a [`Key`], keeping the five leading fields.
+fn key_to_prefix(key: &Key) -> IndexPrefix {
+    let Key {
+        field1,
+        field2,
+        field3,
+        field4,
+        field5,
+        ..
+    } = *key;
+
+    IndexPrefix {
+        field1,
+        field2,
+        field3,
+        field4,
+        field5,
+    }
+}
+
 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
@@ -51,6 +86,9 @@ pub struct InMemoryLayer {
    /// Writes are only allowed when this is `None`.
    pub(crate) end_lsn: OnceLock<Lsn>,

+    /// Used for traversal path. Cached representation of the in-memory layer before frozen.
+    local_path_str: Arc<str>,
+
    /// Used for traversal path. Cached representation of the in-memory layer after frozen.
    frozen_local_path_str: OnceLock<Arc<str>>,

@@ -75,7 +113,7 @@ pub struct InMemoryLayerInner {
    /// All versions of all pages in the layer are kept here. Indexed
    /// by block number and LSN. The value is an offset into the
    /// ephemeral file where the page version is stored.
-    index: BTreeMap<Key, VecMap<Lsn, u64>>,
+    index: BTreeMap<IndexPrefix, BTreeMap<u32, VecMap<Lsn, u64>>>,

    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
@@ -241,6 +279,12 @@ impl InMemoryLayer {
        self.start_lsn..self.end_lsn_or_max()
    }

+    /// Cached display string for this layer: the frozen variant once it has
+    /// been set, the one built at creation time otherwise.
+    pub(crate) fn local_path_str(&self) -> &Arc<str> {
+        match self.frozen_local_path_str.get() {
+            Some(frozen) => frozen,
+            None => &self.local_path_str,
+        }
+    }
+
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
@@ -260,36 +304,96 @@ impl InMemoryLayer {

        let cursor = inner.file.block_cursor();
        let mut buf = Vec::new();
-        for (key, vec_map) in inner.index.iter() {
-            for (lsn, pos) in vec_map.as_slice() {
-                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
-                let val = Value::des(&buf);
-                match val {
-                    Ok(Value::Image(img)) => {
-                        write!(&mut desc, " img {} bytes", img.len())?;
-                    }
-                    Ok(Value::WalRecord(rec)) => {
-                        let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
-                        write!(
-                            &mut desc,
-                            " rec {} bytes will_init: {} {}",
-                            buf.len(),
-                            rec.will_init(),
-                            wal_desc
-                        )?;
-                    }
-                    Err(err) => {
-                        write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
+        for (key_prefix, inner) in inner.index.iter() {
+            for (blkno, vec_map) in inner {
+                let key = materialize_key(key_prefix, *blkno);
+
+                for (lsn, pos) in vec_map.as_slice() {
+                    let mut desc = String::new();
+                    cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
+                    let val = Value::des(&buf);
+                    match val {
+                        Ok(Value::Image(img)) => {
+                            write!(&mut desc, " img {} bytes", img.len())?;
+                        }
+                        Ok(Value::WalRecord(rec)) => {
+                            let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
+                            write!(
+                                &mut desc,
+                                " rec {} bytes will_init: {} {}",
+                                buf.len(),
+                                rec.will_init(),
+                                wal_desc
+                            )?;
+                        }
+                        Err(err) => {
+                            write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
+                        }
                    }
+                    println!("  key {} at {}: {}", key, lsn, desc);
                }
-                println!("  key {} at {}: {}", key, lsn, desc);
            }
        }

        Ok(())
    }

+    /// Look up given value in the layer.
+    ///
+    /// Walks the versions of `key` inside `lsn_range` from newest to oldest
+    /// and stores them in `reconstruct_state`. Returns `Complete` once a page
+    /// image or a will-init WAL record has been stored, `Continue` otherwise
+    /// (including when the key has no entry in this layer at all).
+    ///
+    /// Fails if `lsn_range` starts before this layer's `start_lsn`, or if a
+    /// value cannot be read or deserialized.
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        ensure!(lsn_range.start >= self.start_lsn);
+
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+
+        // The read guard is held until the function returns.
+        let guard = self.inner.read().await;
+        let reader = guard.file.block_cursor();
+
+        // Two-level lookup: key prefix first, then the block number (`field6`).
+        let versions = guard
+            .index
+            .get(&key_to_prefix(&key))
+            .and_then(|by_blkno| by_blkno.get(&key.field6));
+        let vec_map = match versions {
+            Some(vec_map) => vec_map,
+            // Nothing stored for this key here; an older image is still needed.
+            None => return Ok(ValueReconstructResult::Continue),
+        };
+
+        // Scan the page versions backwards, newest entry of `lsn_range` first.
+        let slice = vec_map.slice_range(lsn_range);
+        for (entry_lsn, pos) in slice.iter().rev() {
+            let buf = reader.read_blob(*pos, &ctx).await?;
+            match Value::des(&buf)? {
+                Value::Image(img) => {
+                    reconstruct_state.img = Some((*entry_lsn, img));
+                    return Ok(ValueReconstructResult::Complete);
+                }
+                Value::WalRecord(rec) => {
+                    let initializes_page = rec.will_init();
+                    reconstruct_state.records.push((*entry_lsn, rec));
+                    if initializes_page {
+                        // This WAL record initializes the page, so no need to go further back
+                        return Ok(ValueReconstructResult::Complete);
+                    }
+                }
+            }
+        }
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        Ok(ValueReconstructResult::Continue)
+    }
+
+
    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    //
@@ -309,34 +413,54 @@ impl InMemoryLayer {
        let reader = inner.file.block_cursor();

        for range in keyspace.ranges.iter() {
-            for (key, vec_map) in inner.index.range(range.start..range.end) {
-                let lsn_range = match reconstruct_state.get_cached_lsn(key) {
-                    Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
-                    None => self.start_lsn..end_lsn,
+            // `range` is end-exclusive; work with the last key that is still inside it.
+            let range_incl = range.start..=Key::from_i128(Key::to_i128(&range.end) - 1);
+
+            // Both prefix bounds must describe keys that are inside the range.
+            // Taking the upper prefix from the exclusive `range.end` is wrong when
+            // `range.end.field6 == 0`: that prefix lies entirely past the range, yet
+            // it would be visited with block bounds 0..=0xffffffff and leak
+            // out-of-range keys into the reconstruct state.
+            let prefix_start = key_to_prefix(&range.start);
+            let prefix_end = key_to_prefix(range_incl.end());
+
+            for (prefix, relation_idx) in inner.index.range(prefix_start..=prefix_end) {
+                let blkno_start = if prefix == &key_to_prefix(&range_incl.start()) {
+                    range_incl.start().field6
+                } else {
+                    0
                };

-                let slice = vec_map.slice_range(lsn_range);
+                let blkno_end = if prefix == &key_to_prefix(&range_incl.end()) {
+                    range_incl.end().field6
+                } else {
+                    0xffffffff
+                };

-                for (entry_lsn, pos) in slice.iter().rev() {
-                    // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
-                    let buf = reader.read_blob(*pos, &ctx).await;
-                    if let Err(e) = buf {
-                        reconstruct_state
-                            .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
-                        break;
-                    }
+                for (blkno, vec_map) in relation_idx.range(blkno_start..=blkno_end) {
+                    let key = materialize_key(prefix, *blkno);
+                    let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
+                        Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
+                        None => self.start_lsn..end_lsn,
+                    };

-                    let value = Value::des(&buf.unwrap());
-                    if let Err(e) = value {
-                        reconstruct_state
-                            .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
-                        break;
-                    }
+                    let slice = vec_map.slice_range(lsn_range);

-                    let key_situation =
-                        reconstruct_state.update_key(key, *entry_lsn, value.unwrap());
-                    if key_situation == ValueReconstructSituation::Complete {
-                        break;
+                    for (entry_lsn, pos) in slice.iter().rev() {
+                        // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
+                        let buf = reader.read_blob(*pos, &ctx).await;
+                        if let Err(e) = buf {
+                            reconstruct_state
+                                .on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                            break;
+                        }
+
+                        let value = Value::des(&buf.unwrap());
+                        if let Err(e) = value {
+                            reconstruct_state
+                                .on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                            break;
+                        }
+
+                        let key_situation =
+                            reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
+                        if key_situation == ValueReconstructSituation::Complete {
+                            break;
+                        }
                    }
                }
            }
@@ -348,6 +472,74 @@ impl InMemoryLayer {
    }
 }

+/// A batch of values pre-serialized into one contiguous buffer, together with
+/// the index needed to locate each value inside that buffer.
+pub struct SerializedBatch {
+    /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
+    pub(crate) raw: Vec<u8>,
+
+    /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
+    pub(crate) offsets: Vec<(Key, Lsn, u64)>,
+
+    /// The highest LSN of any value in the batch (`Lsn(0)` for an empty batch)
+    pub(crate) max_lsn: Lsn,
+}
+
+impl SerializedBatch {
+    /// Write a blob length in the internal format of the EphemeralFile
+    ///
+    /// Lengths below `0x80` are written as a single byte. Larger lengths are
+    /// written as a 4-byte big-endian integer whose top bit is set to mark the
+    /// long form, which leaves 31 bits for the length itself.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `len` does not fit into 31 bits: such a length cannot be
+    /// represented, and writing it anyway would silently corrupt the header.
+    pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
+        use std::io::Write;
+
+        if len < 0x80 {
+            // short one-byte length header
+            let len_buf = [len as u8];
+
+            cursor
+                .write_all(&len_buf)
+                .expect("Writing to Vec is infallible");
+        } else {
+            // The top bit is the long-form marker, so it must not be part of the length.
+            assert!(
+                len <= 0x7fff_ffff,
+                "blob too large ({} bytes)",
+                len
+            );
+            let mut len_buf = u32::to_be_bytes(len as u32);
+            len_buf[0] |= 0x80;
+            cursor
+                .write_all(&len_buf)
+                .expect("Writing to Vec is infallible");
+        }
+    }
+
+    /// Serialize `batch` into a single buffer, in the order given.
+    ///
+    /// Each value is written as a length header (see [`Self::write_blob_length`])
+    /// followed by its serialized bytes. The offset recorded for a value points
+    /// at its length header, relative to the start of the buffer.
+    pub fn from_values(batch: Vec<(Key, Lsn, Value)>) -> Self {
+        use std::io::Write;
+
+        let mut offsets: Vec<(Key, Lsn, u64)> = Vec::new();
+        let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(batch.len() * 8192));
+        let mut max_lsn: Lsn = Lsn(0);
+        // Scratch buffer reused for every value, so its size is known before the header is written.
+        let mut value_buf = smallvec::SmallVec::<[u8; 256]>::new();
+        for (key, lsn, val) in batch {
+            let relative_off = cursor.position();
+
+            value_buf.clear();
+            val.ser_into(&mut value_buf)
+                .expect("Value serialization is infallible");
+            Self::write_blob_length(value_buf.len(), &mut cursor);
+
+            cursor
+                .write_all(&value_buf)
+                .expect("Writing to Vec is infallible");
+
+            // We can't write straight into the buffer, because the InMemoryLayer file format requires
+            // the size to come before the value.  However... we could probably calculate the size before
+            // actually serializing the value
+            //val.ser_into(&mut cursor)?;
+
+            offsets.push((key, lsn, relative_off));
+            max_lsn = std::cmp::max(max_lsn, lsn);
+        }
+
+        Self {
+            raw: cursor.into_inner(),
+            offsets,
+            max_lsn,
+        }
+    }
+}
+
+
 fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
    write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
 }
@@ -391,6 +583,11 @@ impl InMemoryLayer {

        Ok(InMemoryLayer {
            file_id: key,
+            local_path_str: {
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
+                buf.into()
+            },
            frozen_local_path_str: OnceLock::new(),
            conf,
            timeline_id,
@@ -406,38 +603,20 @@ impl InMemoryLayer {
        })
    }

-    // Write operations
-
-    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
-    /// Adds the page version to the in-memory tree
-
-    pub(crate) async fn put_value(
+    // Write path.
+    pub async fn put_batch(
        &self,
-        key: Key,
-        lsn: Lsn,
-        buf: &[u8],
+        serialized_batch: &SerializedBatch,
        ctx: &RequestContext,
    ) -> Result<()> {
        let mut inner = self.inner.write().await;
-        self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
-    }
+        //self.assert_writable();

-    async fn put_value_locked(
-        &self,
-        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
-        key: Key,
-        lsn: Lsn,
-        buf: &[u8],
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-
-        let off = {
-            locked_inner
+        let base_off = {
+            inner
                .file
-                .write_blob(
-                    buf,
+                .write_raw(
+                    &serialized_batch.raw,
                    &RequestContextBuilder::extend(ctx)
                        .page_content_kind(PageContentKind::InMemoryLayer)
                        .build(),
@@ -445,15 +624,21 @@ impl InMemoryLayer {
                .await?
        };

-        let vec_map = locked_inner.index.entry(key).or_default();
-        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
-        if old.is_some() {
-            // We already had an entry for this LSN. That's odd..
-            warn!("Key {} at {} already exists", key, lsn);
+        for (key, lsn, relative_off) in &serialized_batch.offsets {
+            let prefix = key_to_prefix(&key);
+
+            let relation_idx = match inner.index.get_mut(&prefix) {
+                Some(i) => i,
+                None => inner.index.entry(prefix).or_default(),
+            };
+
+            let off = base_off + relative_off;
+            let vec_map = relation_idx.entry(key.field6).or_default();
+            vec_map.append_fast(*lsn, off);
        }

-        let size = locked_inner.file.len();
-        locked_inner.resource_units.maybe_publish_size(size);
+        let size = inner.file.len();
+        inner.resource_units.maybe_publish_size(size);

        Ok(())
    }
@@ -476,8 +661,6 @@ impl InMemoryLayer {
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    pub async fn freeze(&self, end_lsn: Lsn) {
-        let inner = self.inner.write().await;
-
        assert!(
            self.start_lsn < end_lsn,
            "{} >= {}",
@@ -495,11 +678,15 @@ impl InMemoryLayer {
            })
            .expect("frozen_local_path_str set only once");

-        for vec_map in inner.index.values() {
-            for (lsn, _pos) in vec_map.as_slice() {
-                assert!(*lsn < end_lsn);
-            }
-        }
+        // #[cfg(debug_assertions)]
+        // {
+        //     let inner = self.inner.write().await;
+        //     for vec_map in inner.index.values() {
+        //         for (lsn, _pos) in vec_map.as_slice() {
+        //             assert!(*lsn < end_lsn);
+        //         }
+        //     }
+        // }
    }

    /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
@@ -507,12 +694,12 @@ impl InMemoryLayer {
    /// if there are no matching keys.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub(crate) async fn write_to_disk(
+    pub async fn write_to_disk(
        &self,
-        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        key_range: Option<Range<Key>>,
-    ) -> Result<Option<ResidentLayer>> {
+        l0_flush_global_state: &l0_flush::Inner,
+    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -524,9 +711,8 @@ impl InMemoryLayer {
        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().await;

-        let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
        use l0_flush::Inner;
-        let _concurrency_permit = match &*l0_flush_global_state {
+        let _concurrency_permit = match l0_flush_global_state {
            Inner::PageCached => None,
            Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
        };
@@ -534,11 +720,12 @@ impl InMemoryLayer {
        let end_lsn = *self.end_lsn.get().unwrap();

        let key_count = if let Some(key_range) = key_range {
-            inner
-                .index
-                .iter()
-                .filter(|(k, _)| key_range.contains(k))
-                .count()
+            panic!("Update for IndexPrefix");
+            // inner
+            //     .index
+            //     .iter()
+            //     .filter(|(k, _)| key_range.contains(k))
+            //     .count()
        } else {
            inner.index.len()
        };
@@ -556,7 +743,7 @@ impl InMemoryLayer {
        )
        .await?;

-        match &*l0_flush_global_state {
+        match l0_flush_global_state {
            l0_flush::Inner::PageCached => {
                let ctx = RequestContextBuilder::extend(ctx)
                    .page_content_kind(PageContentKind::InMemoryLayer)
@@ -566,16 +753,20 @@ impl InMemoryLayer {

                let cursor = inner.file.block_cursor();

-                for (key, vec_map) in inner.index.iter() {
-                    // Write all page versions
-                    for (lsn, pos) in vec_map.as_slice() {
-                        cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
-                        let will_init = Value::des(&buf)?.will_init();
-                        let res;
-                        (buf, res) = delta_layer_writer
-                            .put_value_bytes(*key, *lsn, buf, will_init, &ctx)
-                            .await;
-                        res?;
+                for (key_prefix, inner) in inner.index.iter() {
+                    for (blkno, vec_map) in inner {
+                        let key = materialize_key(key_prefix, *blkno);
+
+                        // Write all page versions
+                        for (lsn, pos) in vec_map.as_slice() {
+                            cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
+                            let will_init = Value::des(&buf)?.will_init();
+                            let res;
+                            (buf, res) = delta_layer_writer
+                                .put_value_bytes(key, *lsn, buf, will_init, &ctx)
+                                .await;
+                            res?;
+                        }
                    }
                }
            }
@@ -599,29 +790,32 @@ impl InMemoryLayer {

                let mut buf = Vec::new();

-                for (key, vec_map) in inner.index.iter() {
-                    // Write all page versions
-                    for (lsn, pos) in vec_map.as_slice() {
-                        // TODO: once we have blob lengths in the in-memory index, we can
-                        // 1. get rid of the blob_io / BlockReaderRef::Slice business and
-                        // 2. load the file contents into a Bytes and
-                        // 3. the use `Bytes::slice` to get the `buf` that is our blob
-                        // 4. pass that `buf` into `put_value_bytes`
-                        // => https://github.com/neondatabase/neon/issues/8183
-                        cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
-                        let will_init = Value::des(&buf)?.will_init();
-                        let res;
-                        (buf, res) = delta_layer_writer
-                            .put_value_bytes(*key, *lsn, buf, will_init, ctx)
-                            .await;
-                        res?;
+                for (key_prefix, inner) in inner.index.iter() {
+                    for (blkno, vec_map) in inner {
+                        // Write all page versions
+                        let key = materialize_key(key_prefix, *blkno);
+                        for (lsn, pos) in vec_map.as_slice() {
+                            // TODO: once we have blob lengths in the in-memory index, we can
+                            // 1. get rid of the blob_io / BlockReaderRef::Slice business and
+                            // 2. load the file contents into a Bytes and
+                            // 3. then use `Bytes::slice` to get the `buf` that is our blob
+                            // 4. pass that `buf` into `put_value_bytes`
+                            // => https://github.com/neondatabase/neon/issues/8183
+                            cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
+                            let will_init = Value::des(&buf)?.will_init();
+                            let res;
+                            (buf, res) = delta_layer_writer
+                                .put_value_bytes(key, *lsn, buf, will_init, ctx)
+                                .await;
+                            res?;
+                        }
                    }
                }
            }
        }

        // MAX is used here because we identify L0 layers by full key range
-        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
+        let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;

        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
        //
@@ -633,6 +827,6 @@ impl InMemoryLayer {
        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
        drop(_concurrency_permit);

-        Ok(Some(delta_layer))
+        Ok(Some((desc, path)))
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -24,7 +24,8 @@ use super::delta_layer::{self, DeltaEntry};
 use super::image_layer::{self};
 use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
+    LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
+    ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -300,6 +301,42 @@ impl Layer {
        self.0.delete_on_drop();
    }

+    /// Return the data this layer holds for reconstructing `key` within `lsn_range`.
+    ///
+    /// Fails if `key` or `lsn_range` is not covered by the layer descriptor. It is up to the
+    /// caller to collect more data from the previous layer and perform WAL redo, if necessary.
+    ///
+    /// # Cancellation-Safety
+    ///
+    /// This method is cancellation-safe.
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        use anyhow::ensure;
+
+        let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
+        self.0.access_stats.record_access(ctx);
+        // Sanity-check the request against the key/LSN range in this layer's descriptor.
+        if self.layer_desc().is_delta {
+            ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
+            ensure!(self.layer_desc().key_range.contains(&key));
+        } else {
+            ensure!(self.layer_desc().key_range.contains(&key));
+            ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
+            ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
+        }
+
+        layer
+            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
+            .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
+            .await
+            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
+    }
+
    pub(crate) async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
@@ -404,6 +441,10 @@ impl Layer {
        &self.0.path
    }

+    pub(crate) fn debug_str(&self) -> &Arc<str> {
+        &self.0.debug_str // "timelines/<timeline_id>/<layer_name>", used for traversal id
+    }
+
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.0.metadata()
    }
@@ -478,7 +519,7 @@ impl Layer {
 ///
 /// However when we want something evicted, we cannot evict it right away as there might be current
 /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
-/// read with [`Layer::get_values_reconstruct_data`].
+/// read with [`Layer::get_value_reconstruct_data`].
 ///
 /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
 #[derive(Debug)]
@@ -559,6 +600,9 @@ struct LayerInner {
    /// Full path to the file; unclear if this should exist anymore.
    path: Utf8PathBuf,

+    /// String representation of the layer, used for traversal id.
+    debug_str: Arc<str>,
+
    desc: PersistentLayerDesc,

    /// Timeline access is needed for remote timeline client and metrics.
@@ -792,6 +836,9 @@ impl LayerInner {

        LayerInner {
            conf,
+            debug_str: {
+                format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
+            },
            path: local_path,
            desc,
            timeline: Arc::downgrade(timeline),
@@ -1712,6 +1759,28 @@ impl DownloadedLayer {
            .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
    }

+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        owner: &Arc<LayerInner>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        // Resolve the loaded layer first, then forward the request according to its kind.
+        let kind = self.get(owner, ctx).await?;
+        match kind {
+            LayerKind::Delta(delta) => {
+                delta.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
+                    .await
+            }
+            LayerKind::Image(image) => {
+                image.get_value_reconstruct_data(key, reconstruct_data, ctx)
+                    .await
+            }
+        }
+    }
+
    async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -50,26 +50,13 @@ async fn smoke_test() {
    // all layers created at pageserver are like `layer`, initialized with strong
    // Arc<DownloadedLayer>.

-    let controlfile_keyspace = KeySpace {
-        ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
-    };
-
    let img_before = {
-        let mut data = ValuesReconstructState::default();
+        let mut data = ValueReconstructState::default();
        layer
-            .get_values_reconstruct_data(
-                controlfile_keyspace.clone(),
-                Lsn(0x10)..Lsn(0x11),
-                &mut data,
-                &ctx,
-            )
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
            .await
            .unwrap();
-        data.keys
-            .remove(&CONTROLFILE_KEY)
-            .expect("must be present")
-            .expect("should not error")
-            .img
+        data.img
            .take()
            .expect("tenant harness writes the control file")
    };
@@ -87,24 +74,13 @@ async fn smoke_test() {

    // on accesses when the layer is evicted, it will automatically be downloaded.
    let img_after = {
-        let mut data = ValuesReconstructState::default();
+        let mut data = ValueReconstructState::default();
        layer
-            .get_values_reconstruct_data(
-                controlfile_keyspace.clone(),
-                Lsn(0x10)..Lsn(0x11),
-                &mut data,
-                &ctx,
-            )
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
            .instrument(download_span.clone())
            .await
            .unwrap();
-        data.keys
-            .remove(&CONTROLFILE_KEY)
-            .expect("must be present")
-            .expect("should not error")
-            .img
-            .take()
-            .expect("tenant harness writes the control file")
+        data.img.take().unwrap()
    };

    assert_eq!(img_before, img_after);
@@ -854,7 +830,7 @@ async fn eviction_cancellation_on_drop() {
 fn layer_size() {
    assert_eq!(size_of::<LayerAccessStats>(), 8);
    assert_eq!(size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(size_of::<LayerInner>(), 296);
+    assert_eq!(size_of::<LayerInner>(), 312);
    // it also has the utf8 path
 }

--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -4,6 +4,7 @@ use bytes::Bytes;
 use pageserver_api::key::{Key, KEY_SIZE};
 use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};

+use crate::tenant::storage_layer::Layer;
 use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};

 use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
@@ -173,8 +174,9 @@ impl SplitDeltaLayerWriter {
            )
            .await?;
            let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
-            self.generated_layers
-                .push(prev_delta_writer.finish(key, tline, ctx).await?);
+            let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
+            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+            self.generated_layers.push(delta_layer);
        }
        self.inner.put_value(key, lsn, val, ctx).await
    }
@@ -190,7 +192,10 @@ impl SplitDeltaLayerWriter {
            inner,
            ..
        } = self;
-        generated_layers.push(inner.finish(end_key, tline, ctx).await?);
+
+        let (desc, path) = inner.finish(end_key, ctx).await?;
+        let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+        generated_layers.push(delta_layer);
        Ok(generated_layers)
    }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -22,8 +22,8 @@ use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::{
-        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
-        NON_INHERITED_SPARSE_RANGE,
+        AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
+        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
    },
    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
@@ -47,7 +47,6 @@ use utils::{
    bin_ser::BeSer,
    fs_ext, pausable_failpoint,
    sync::gate::{Gate, GateGuard},
-    vec_map::VecMap,
 };

 use std::pin::pin;
@@ -59,7 +58,10 @@ use std::{
    collections::{BTreeMap, HashMap, HashSet},
    sync::atomic::AtomicU64,
 };
-use std::{cmp::min, ops::ControlFlow};
+use std::{
+    cmp::{max, min},
+    ops::ControlFlow,
+};
 use std::{
    collections::btree_map::Entry,
    ops::{Deref, Range},
@@ -84,8 +86,8 @@ use crate::{
    disk_usage_eviction_task::finite_f32,
    tenant::storage_layer::{
        AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
-        LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState,
-        ValuesReconstructState,
+        LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult,
+        ValueReconstructState, ValuesReconstructState,
    },
 };
 use crate::{
@@ -137,7 +139,7 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::{config::TenantConf, upload_queue::NotInitialized};
+use super::{config::TenantConf, storage_layer::inmemory_layer, upload_queue::NotInitialized};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{
@@ -540,6 +542,7 @@ pub struct MissingKeyError {
    cont_lsn: Lsn,
    request_lsn: Lsn,
    ancestor_lsn: Option<Lsn>,
+    traversal_path: Vec<TraversalPathItem>,
    backtrace: Option<std::backtrace::Backtrace>,
 }

@@ -560,6 +563,18 @@ impl std::fmt::Display for MissingKeyError {
            write!(f, ", ancestor {}", ancestor_lsn)?;
        }

+        if !self.traversal_path.is_empty() {
+            writeln!(f)?;
+        }
+
+        for (r, c, l) in &self.traversal_path {
+            writeln!(
+                f,
+                "layer traversal: result {:?}, cont_lsn {}, layer: {}",
+                r, c, l,
+            )?;
+        }
+
        if let Some(ref backtrace) = self.backtrace {
            write!(f, "\n{}", backtrace)?;
        }
@@ -688,7 +703,6 @@ pub(crate) enum CompactFlags {
    ForceRepartition,
    ForceImageLayerCreation,
    EnhancedGcBottomMostCompaction,
-    DryRun,
 }

 impl std::fmt::Debug for Timeline {
@@ -902,44 +916,119 @@ impl Timeline {

        self.timeline_get_throttle.throttle(ctx, 1).await;

-        let keyspace = KeySpace {
-            ranges: vec![key..key.next()],
-        };
+        match self.conf.get_impl {
+            GetImpl::Legacy => {
+                let reconstruct_state = ValueReconstructState {
+                    records: Vec::new(),
+                    img: None,
+                };

-        // Initialise the reconstruct state for the key with the cache
-        // entry returned above.
-        let mut reconstruct_state = ValuesReconstructState::new();
+                self.get_impl(key, lsn, reconstruct_state, ctx).await
+            }
+            GetImpl::Vectored => {
+                let keyspace = KeySpace {
+                    ranges: vec![key..key.next()],
+                };

-        let vectored_res = self
-            .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
-            .await;
+                // Initialise the reconstruct state for the key with the cache
+                // entry returned above.
+                let mut reconstruct_state = ValuesReconstructState::new();

-        let key_value = vectored_res?.pop_first();
-        match key_value {
-            Some((got_key, value)) => {
-                if got_key != key {
-                    error!(
-                        "Expected {}, but singular vectored get returned {}",
-                        key, got_key
-                    );
-                    Err(PageReconstructError::Other(anyhow!(
-                        "Singular vectored get returned wrong key"
-                    )))
-                } else {
-                    value
+                let vectored_res = self
+                    .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+                    .await;
+
+                if self.conf.validate_vectored_get {
+                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
+                        .await;
+                }
+
+                let key_value = vectored_res?.pop_first();
+                match key_value {
+                    Some((got_key, value)) => {
+                        if got_key != key {
+                            error!(
+                                "Expected {}, but singular vectored get returned {}",
+                                key, got_key
+                            );
+                            Err(PageReconstructError::Other(anyhow!(
+                                "Singular vectored get returned wrong key"
+                            )))
+                        } else {
+                            value
+                        }
+                    }
+                    None => Err(PageReconstructError::MissingKey(MissingKeyError {
+                        key,
+                        shard: self.shard_identity.get_shard_number(&key),
+                        cont_lsn: Lsn(0),
+                        request_lsn: lsn,
+                        ancestor_lsn: None,
+                        traversal_path: Vec::new(),
+                        backtrace: None,
+                    })),
                }
            }
-            None => Err(PageReconstructError::MissingKey(MissingKeyError {
-                key,
-                shard: self.shard_identity.get_shard_number(&key),
-                cont_lsn: Lsn(0),
-                request_lsn: lsn,
-                ancestor_lsn: None,
-                backtrace: None,
-            })),
        }
    }

+    /// Single-key read used by `GetImpl::Legacy`. Not subject to [`Self::timeline_get_throttle`].
+    async fn get_impl(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        mut reconstruct_state: ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        // XXX: structured stats collection for layer eviction here.
+        trace!(
+            "get page request for {}@{} from task kind {:?}",
+            key,
+            lsn,
+            ctx.task_kind()
+        );
+        // Step 1: collect the reconstruct data, timed under the singular-get metric.
+        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
+            .for_get_kind(GetKind::Singular)
+            .start_timer();
+        let path = self
+            .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
+            .await?;
+        timer.stop_and_record();
+        // Step 2: reconstruct the value and record how long that took.
+        let start = Instant::now();
+        let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
+        let elapsed = start.elapsed();
+        crate::metrics::RECONSTRUCT_TIME
+            .for_get_kind(GetKind::Singular)
+            .observe(elapsed.as_secs_f64());
+        // With the "testing" feature, log the traversal path of any non-cancellation failure.
+        if cfg!(feature = "testing")
+            && res.is_err()
+            && !matches!(res, Err(PageReconstructError::Cancelled))
+        {
+            // it can only be a walredo issue
+            use std::fmt::Write;
+
+            let mut msg = String::new();
+
+            path.into_iter().for_each(|(res, cont_lsn, layer)| {
+                writeln!(
+                    msg,
+                    "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
+                    layer,
+                )
+                .expect("string grows")
+            });
+
+            // this is to rule out or provide evidence that we could in some cases read a duplicate
+            // walrecord
+            tracing::info!("walredo failed, path:\n{msg}");
+        }
+
+        res
+    }
+
    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
    pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;

@@ -989,14 +1078,28 @@ impl Timeline {
            .throttle(ctx, key_count as usize)
            .await;

-        let res = self
-            .get_vectored_impl(
-                keyspace.clone(),
-                lsn,
-                &mut ValuesReconstructState::new(),
-                ctx,
-            )
-            .await;
+        let res = match self.conf.get_vectored_impl {
+            GetVectoredImpl::Sequential => {
+                self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
+            }
+            GetVectoredImpl::Vectored => {
+                let vectored_res = self
+                    .get_vectored_impl(
+                        keyspace.clone(),
+                        lsn,
+                        &mut ValuesReconstructState::new(),
+                        ctx,
+                    )
+                    .await;
+
+                if self.conf.validate_vectored_get {
+                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
+                        .await;
+                }
+
+                vectored_res
+            }
+        };

        if let Some((metric, start)) = start {
            let elapsed = start.elapsed();
@@ -1085,6 +1188,65 @@ impl Timeline {
        vectored_res
    }

+    /// Not subject to [`Self::timeline_get_throttle`].
+    pub(super) async fn get_vectored_sequential_impl(
+        &self,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        let mut values = BTreeMap::new();
+        // Read every key of every range through `get_impl`, one key at a time.
+        for range in keyspace.ranges {
+            let mut key = range.start;
+            while key != range.end {
+                let block = self
+                    .get_impl(key, lsn, ValueReconstructState::default(), ctx)
+                    .await;
+                // An error either aborts the whole read, is skipped, or is stored under its key.
+                use PageReconstructError::*;
+                match block {
+                    Err(Cancelled) => return Err(GetVectoredError::Cancelled),
+                    Err(MissingKey(_))
+                        if NON_INHERITED_RANGE.contains(&key)
+                            || NON_INHERITED_SPARSE_RANGE.contains(&key) =>
+                    {
+                        // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range.
+                        // When we add more types of keys into the page server, we should revisit this part of code and throw errors
+                        // accordingly.
+                        key = key.next();
+                    }
+                    Err(MissingKey(err)) => {
+                        return Err(GetVectoredError::MissingKey(err));
+                    }
+                    Err(Other(err))
+                        if err
+                            .to_string()
+                            .contains("downloading evicted layer file failed") =>
+                    {
+                        return Err(GetVectoredError::Other(err))
+                    }
+                    Err(Other(err))
+                        if err
+                            .chain()
+                            .any(|cause| cause.to_string().contains("layer loading failed")) =>
+                    {
+                        // The intent here is to achieve error parity with the vectored read path.
+                        // When vectored read fails to load a layer it fails the whole read, hence
+                        // we mimic this behaviour here to keep the validation happy.
+                        return Err(GetVectoredError::Other(err));
+                    }
+                    _ => {
+                        values.insert(key, block);
+                        key = key.next();
+                    }
+                }
+            }
+        }
+
+        Ok(values)
+    }
+
    pub(super) async fn get_vectored_impl(
        &self,
        keyspace: KeySpace,
@@ -1155,6 +1317,113 @@ impl Timeline {
        Ok(results)
    }

+    /// Re-read `keyspace` sequentially and panic if the outcome disagrees with `vectored_res`.
+    /// Skipped for metadata keys; cancellation and vectored ancestor-LSN timeouts are tolerated.
+    /// Not subject to [`Self::timeline_get_throttle`].
+    pub(super) async fn validate_get_vectored_impl(
+        &self,
+        vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) {
+        if keyspace.overlaps(&Key::metadata_key_range()) {
+            // skip validation for metadata key range
+            return;
+        }
+
+        let sequential_res = self
+            .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx)
+            .await;
+
+        fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool {
+            use GetVectoredError::*;
+            match (lhs, rhs) {
+                (Oversized(l), Oversized(r)) => l == r,
+                (InvalidLsn(l), InvalidLsn(r)) => l == r,
+                (MissingKey(l), MissingKey(r)) => l.key == r.key,
+                (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true,
+                (Other(_), Other(_)) => true,
+                _ => false,
+            }
+        }
+
+        match (&sequential_res, vectored_res) {
+            (Err(GetVectoredError::Cancelled), _) | (_, Err(GetVectoredError::Cancelled)) => {},
+            (Err(seq_err), Ok(_)) => {
+                panic!(concat!("Sequential get failed with {}, but vectored get did not",
+                               " - keyspace={:?} lsn={}"),
+                       seq_err, keyspace, lsn) },
+            (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => {
+                // Sequential get runs after vectored get, so the latter may time out waiting for its
+                // ancestor's Lsn to become ready while the former succeeds (it has a doubled wait time).
+            },
+            (Ok(_), Err(vec_err)) => {
+                panic!(concat!("Vectored get failed with {}, but sequential get did not",
+                               " - keyspace={:?} lsn={}"),
+                       vec_err, keyspace, lsn) },
+            (Err(seq_err), Err(vec_err)) => {
+                assert!(errors_match(seq_err, vec_err),
+                        "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")},
+            (Ok(seq_values), Ok(vec_values)) => {
+                // zip() stops at the shorter map, so compare the key counts explicitly first.
+                assert_eq!(seq_values.len(), vec_values.len(), "Key count mismatch - keyspace={keyspace:?} lsn={lsn}");
+                seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| {
+                    assert_eq!(seq_key, vec_key);
+                    match (seq_res, vec_res) {
+                        (Ok(seq_blob), Ok(vec_blob)) => Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob),
+                        (Err(err), Ok(_)) => {
+                            panic!(
+                                concat!("Sequential get failed with {} for key {}, but vectored get did not",
+                                        " - keyspace={:?} lsn={}"),
+                                err, seq_key, keyspace, lsn) },
+                        (Ok(_), Err(err)) => {
+                            panic!(
+                                concat!("Vectored get failed with {} for key {}, but sequential get did not",
+                                        " - keyspace={:?} lsn={}"),
+                                err, seq_key, keyspace, lsn) },
+                        (Err(_), Err(_)) => {}
+                    }
+                })
+            }
+        }
+    }
+
+    /// Panic unless the sequential and vectored values read for `key` are equivalent.
+    fn validate_key_equivalence(
+        key: &Key,
+        keyspace: &KeySpace,
+        lsn: Lsn,
+        seq: &Bytes,
+        vec: &Bytes,
+    ) {
+        if *key != AUX_FILES_KEY {
+            // Every other key reconstructs deterministically, so the raw blobs must be equal.
+            assert_eq!(
+                seq, vec,
+                "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}"
+            );
+            return;
+        }
+        // AUX_FILES_KEY is rebuilt from records through a hash map, so its bytes are not
+        // deterministic: compare the deserialised directories instead of the blobs.
+        let seq_aux_dir_res = AuxFilesDirectory::des(seq);
+        let vec_aux_dir_res = AuxFilesDirectory::des(vec);
+        match (&seq_aux_dir_res, &vec_aux_dir_res) {
+            (Ok(from_seq), Ok(from_vec)) => {
+                assert_eq!(
+                    from_seq, from_vec,
+                    "Mismatch for key {} - keyspace={:?} lsn={}",
+                    key, keyspace, lsn
+                );
+            }
+            (Err(_), Err(_)) => {}
+            _ => {
+                panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}");
+            }
+        }
+    }
+
    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
    pub(crate) fn get_last_record_lsn(&self) -> Lsn {
        self.last_record_lsn.load().last
@@ -2944,7 +3213,228 @@ impl Timeline {
    }
 }

+type TraversalId = Arc<str>;
+/// Supplies the [`TraversalId`] recorded for each layer visited by `get_reconstruct_data`.
+trait TraversalLayerExt {
+    fn traversal_id(&self) -> TraversalId;
+}
+
+impl TraversalLayerExt for Layer {
+    fn traversal_id(&self) -> TraversalId {
+        Arc::clone(self.debug_str())
+    }
+}
+
+impl TraversalLayerExt for Arc<InMemoryLayer> {
+    fn traversal_id(&self) -> TraversalId {
+        Arc::clone(self.local_path_str())
+    }
+}
+
 impl Timeline {
+    ///
+    /// Collect the data needed to reconstruct `key` at `request_lsn` into `reconstruct_state`.
+    ///
+    /// Layers are visited from newest to oldest (open, then frozen, then the layer map),
+    /// descending into ancestor timelines for inherited keys when needed.
+    ///
+    /// On success, returns the path of layers visited; a `MissingKey` error carries
+    /// the same path for debugging.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
+    async fn get_reconstruct_data(
+        &self,
+        key: Key,
+        request_lsn: Lsn,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<Vec<TraversalPathItem>, PageReconstructError> {
+        // Start from the current timeline.
+        let mut timeline_owned;
+        let mut timeline = self;
+
+        let mut read_count = scopeguard::guard(0, |cnt| {
+            crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
+        });
+
+        // For debugging purposes, collect the path of layers that we traversed
+        // through. It's included in the error message if we fail to find the key.
+        let mut traversal_path = Vec::<TraversalPathItem>::new();
+
+        let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
+            *cached_lsn
+        } else {
+            Lsn(0)
+        };
+
+        // 'prev_lsn' tracks the last LSN that we were at in our search. It's used
+        // to check that each iteration makes some progress, to break out of infinite
+        // looping if something goes wrong.
+        let mut prev_lsn = None;
+
+        let mut result = ValueReconstructResult::Continue;
+        let mut cont_lsn = Lsn(request_lsn.0 + 1);
+
+        'outer: loop {
+            if self.cancel.is_cancelled() {
+                return Err(PageReconstructError::Cancelled);
+            }
+
+            // The previous layer visit should have updated 'reconstruct_state'
+            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
+            match result {
+                ValueReconstructResult::Complete => return Ok(traversal_path),
+                ValueReconstructResult::Continue => {
+                    // If we reached an earlier cached page image, we're done.
+                    if cont_lsn == cached_lsn + 1 {
+                        return Ok(traversal_path);
+                    }
+                    if let Some(prev) = prev_lsn {
+                        if prev <= cont_lsn {
+                            // Didn't make any progress in last iteration. Error out to avoid
+                            // getting stuck in the loop.
+                            return Err(PageReconstructError::MissingKey(MissingKeyError {
+                                key,
+                                shard: self.shard_identity.get_shard_number(&key),
+                                cont_lsn: Lsn(cont_lsn.0 - 1),
+                                request_lsn,
+                                ancestor_lsn: Some(timeline.ancestor_lsn),
+                                traversal_path,
+                                backtrace: None,
+                            }));
+                        }
+                    }
+                    prev_lsn = Some(cont_lsn);
+                }
+                ValueReconstructResult::Missing => {
+                    return Err(PageReconstructError::MissingKey(MissingKeyError {
+                        key,
+                        shard: self.shard_identity.get_shard_number(&key),
+                        cont_lsn,
+                        request_lsn,
+                        ancestor_lsn: None,
+                        traversal_path,
+                        backtrace: if cfg!(test) {
+                            Some(std::backtrace::Backtrace::force_capture())
+                        } else {
+                            None
+                        },
+                    }));
+                }
+            }
+
+            // Switch to the ancestor timeline if needed
+            if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
+                if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
+                    trace!(
+                        "going into ancestor {}, cont_lsn is {}",
+                        timeline.ancestor_lsn,
+                        cont_lsn
+                    );
+
+                    timeline_owned = timeline
+                        .get_ready_ancestor_timeline(ancestor_timeline, ctx)
+                        .await?;
+                    timeline = &*timeline_owned;
+                    prev_lsn = None;
+                    continue 'outer;
+                }
+            }
+
+            let guard = timeline.layers.read().await;
+            let layers = guard.layer_map();
+
+            // Check the open and frozen in-memory layers first, in order from newest
+            // to oldest.
+            if let Some(open_layer) = &layers.open_layer {
+                let start_lsn = open_layer.get_lsn_range().start;
+                if cont_lsn > start_lsn {
+                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display());
+                    // Get all the data needed to reconstruct the page version from this layer.
+                    // But if we have an older cached page image, no need to go past that.
+                    let lsn_floor = max(cached_lsn + 1, start_lsn);
+
+                    let open_layer = open_layer.clone();
+                    drop(guard);
+
+                    result = match open_layer
+                        .get_value_reconstruct_data(
+                            key,
+                            lsn_floor..cont_lsn,
+                            reconstruct_state,
+                            ctx,
+                        )
+                        .await
+                    {
+                        Ok(result) => result,
+                        Err(e) => return Err(PageReconstructError::from(e)),
+                    };
+                    cont_lsn = lsn_floor;
+                    *read_count += 1;
+                    traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
+                    continue 'outer;
+                }
+            }
+            for frozen_layer in layers.frozen_layers.iter().rev() {
+                let start_lsn = frozen_layer.get_lsn_range().start;
+                if cont_lsn > start_lsn {
+                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display());
+                    let lsn_floor = max(cached_lsn + 1, start_lsn);
+
+                    let frozen_layer = frozen_layer.clone();
+                    drop(guard);
+
+                    result = match frozen_layer
+                        .get_value_reconstruct_data(
+                            key,
+                            lsn_floor..cont_lsn,
+                            reconstruct_state,
+                            ctx,
+                        )
+                        .await
+                    {
+                        Ok(result) => result,
+                        Err(e) => return Err(PageReconstructError::from(e)),
+                    };
+                    cont_lsn = lsn_floor;
+                    *read_count += 1;
+                    traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
+                    continue 'outer;
+                }
+            }
+
+            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
+                let layer = guard.get_from_desc(&layer);
+                drop(guard);
+                // Get all the data needed to reconstruct the page version from this layer.
+                // But if we have an older cached page image, no need to go past that.
+                let lsn_floor = max(cached_lsn + 1, lsn_floor);
+                result = match layer
+                    .get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx)
+                    .await
+                {
+                    Ok(result) => result,
+                    Err(e) => return Err(PageReconstructError::from(e)),
+                };
+                cont_lsn = lsn_floor;
+                *read_count += 1;
+                traversal_path.push((result, cont_lsn, layer.traversal_id()));
+                continue 'outer;
+            } else if timeline.ancestor_timeline.is_some() {
+                // Nothing on this timeline. Traverse to parent
+                result = ValueReconstructResult::Continue;
+                cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
+                continue 'outer;
+            } else {
+                // Nothing found
+                result = ValueReconstructResult::Missing;
+                continue 'outer;
+            }
+        }
+    }
+
    #[allow(clippy::doc_lazy_continuation)]
    /// Get the data needed to reconstruct all keys in the provided keyspace
    ///
@@ -3038,6 +3528,7 @@ impl Timeline {
                cont_lsn,
                request_lsn,
                ancestor_lsn: Some(timeline.ancestor_lsn),
+                traversal_path: vec![],
                backtrace: None,
            }));
        }
@@ -3630,11 +4121,17 @@ impl Timeline {

    /// Return true if the value changed
    ///
-    /// This function must only be used from the layer flush task.
+    /// This function must only be used from the layer flush task, and may not be called concurrently.
    fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
-        let old_value = self.disk_consistent_lsn.fetch_max(new_value);
-        assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");
-        new_value != old_value
+        // Separate load and store rather than one atomic update: hence not safe for concurrent use.
+        let old_value = self.disk_consistent_lsn.load();
+        if new_value != old_value {
+            assert!(new_value >= old_value, "disk_consistent_lsn must not move backwards; current {old_value}, offered {new_value}");
+            self.disk_consistent_lsn.store(new_value);
+            true
+        } else {
+            false
+        }
    }

    /// Update metadata file
@@ -3701,12 +4198,14 @@ impl Timeline {
        let frozen_layer = Arc::clone(frozen_layer);
        let ctx = ctx.attached_child();
        let work = async move {
-            let Some(new_delta) = frozen_layer
-                .write_to_disk(&self_clone, &ctx, key_range)
+            let Some((desc, path)) = frozen_layer
+                .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner())
                .await?
            else {
                return Ok(None);
            };
+            let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?;
+
            // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
            // We just need to fsync the directory in which these inodes are linked,
            // which we know to be the timeline directory.
@@ -4229,12 +4728,6 @@ impl Timeline {
            return;
        }

-        if self.current_logical_size.current_size().is_exact() {
-            // root timelines are initialized with exact count, but never start the background
-            // calculation
-            return;
-        }
-
        if let Some(await_bg_cancel) = self
            .current_logical_size
            .cancel_wait_for_background_loop_concurrency_limit_semaphore
@@ -5205,22 +5698,6 @@ impl Timeline {
        }
    }

-    /// Persistently blocks gc for `Manual` reason.
-    ///
-    /// Returns true if no such block existed before, false otherwise.
-    pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result<bool> {
-        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
-        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
-        tenant.gc_block.insert(self, GcBlockingReason::Manual).await
-    }
-
-    /// Persistently unblocks gc for `Manual` reason.
-    pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> {
-        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
-        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
-        tenant.gc_block.remove(self, GcBlockingReason::Manual).await
-    }
-
    #[cfg(test)]
    pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
        self.last_record_lsn.advance(new_lsn);
@@ -5339,9 +5816,8 @@ impl Timeline {
        for (key, lsn, val) in deltas.data {
            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
        }
-        let delta_layer = delta_layer_writer
-            .finish(deltas.key_range.end, self, ctx)
-            .await?;
+        let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
+        let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;

        {
            let mut guard = self.layers.write().await;
@@ -5402,6 +5878,8 @@ impl Timeline {
    }
 }

+type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
+
 /// Tracking writes ingestion does to a particular in-memory layer.
 ///
 /// Cleared upon freezing a layer.
@@ -5453,44 +5931,6 @@ enum OpenLayerAction {
 }

 impl<'a> TimelineWriter<'a> {
-    /// Put a new page version that can be constructed from a WAL record
-    ///
-    /// This will implicitly extend the relation, if the page is beyond the
-    /// current end-of-file.
-    pub(crate) async fn put(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        value: &Value,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // Avoid doing allocations for "small" values.
-        // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-        // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-        let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-        value.ser_into(&mut buf)?;
-        let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
-
-        let action = self.get_open_layer_action(lsn, buf_size);
-        let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
-        let res = layer.put_value(key, lsn, &buf, ctx).await;
-
-        if res.is_ok() {
-            // Update the current size only when the entire write was ok.
-            // In case of failures, we may have had partial writes which
-            // render the size tracking out of sync. That's ok because
-            // the checkpoint distance should be significantly smaller
-            // than the S3 single shot upload limit of 5GiB.
-            let state = self.write_guard.as_mut().unwrap();
-
-            state.current_size += buf_size;
-            state.prev_lsn = Some(lsn);
-            state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
-        }
-
-        res
-    }
-
    async fn handle_open_layer_action(
        &mut self,
        at: Lsn,
@@ -5593,18 +6033,52 @@ impl<'a> TimelineWriter<'a> {
    }

    /// Put a batch of keys at the specified Lsns.
-    ///
-    /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`].
    pub(crate) async fn put_batch(
        &mut self,
-        batch: VecMap<Lsn, (Key, Value)>,
+        batch: Vec<(Key, Lsn, Value)>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        for (lsn, (key, val)) in batch {
-            self.put(key, lsn, &val, ctx).await?
+        if batch.is_empty() {
+            return Ok(());
        }

-        Ok(())
+        let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
+        let batch_max_lsn = serialized_batch.max_lsn;
+        let buf_size: u64 = serialized_batch.raw.len() as u64;
+
+        let action = self.get_open_layer_action(batch_max_lsn, buf_size);
+        let layer = self
+            .handle_open_layer_action(batch_max_lsn, action, ctx)
+            .await?;
+
+        let res = layer.put_batch(&serialized_batch, ctx).await;
+
+        if res.is_ok() {
+            // Update the current size only when the entire write was ok.
+            // In case of failures, we may have had partial writes which
+            // render the size tracking out of sync. That's ok because
+            // the checkpoint distance should be significantly smaller
+            // than the S3 single shot upload limit of 5GiB.
+            let state = self.write_guard.as_mut().unwrap();
+
+            state.current_size += buf_size;
+            state.prev_lsn = Some(batch_max_lsn);
+            state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
+        }
+
+        res
+    }
+
+    #[cfg(test)]
+    /// Test helper, for tests that would like to poke individual values without composing a batch
+    pub(crate) async fn put(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        value: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        self.put_batch(vec![(key, lsn, value.clone())], ctx).await
    }

    pub(crate) async fn delete_batch(
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -19,10 +19,8 @@ use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
-use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
-use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, info_span, trace, warn, Instrument};
 use utils::id::TimelineId;
@@ -43,7 +41,6 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};

 use crate::keyspace::KeySpace;
 use crate::repository::{Key, Value};
-use crate::walrecord::NeonWalRecord;

 use utils::lsn::Lsn;

@@ -76,7 +73,6 @@ impl KeyHistoryRetention {
        key: Key,
        delta_writer: &mut Vec<(Key, Lsn, Value)>,
        mut image_writer: Option<&mut ImageLayerWriter>,
-        stat: &mut CompactionStatistics,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut first_batch = true;
@@ -86,7 +82,6 @@ impl KeyHistoryRetention {
                    let Value::Image(img) = &logs[0].1 else {
                        unreachable!()
                    };
-                    stat.produce_image_key(img);
                    if let Some(image_writer) = image_writer.as_mut() {
                        image_writer.put_image(key, img.clone(), ctx).await?;
                    } else {
@@ -94,111 +89,24 @@ impl KeyHistoryRetention {
                    }
                } else {
                    for (lsn, val) in logs {
-                        stat.produce_key(&val);
                        delta_writer.push((key, lsn, val));
                    }
                }
                first_batch = false;
            } else {
                for (lsn, val) in logs {
-                    stat.produce_key(&val);
                    delta_writer.push((key, lsn, val));
                }
            }
        }
        let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
        for (lsn, val) in above_horizon_logs {
-            stat.produce_key(&val);
            delta_writer.push((key, lsn, val));
        }
        Ok(())
    }
 }

-#[derive(Debug, Serialize, Default)]
-struct CompactionStatisticsNumSize {
-    num: u64,
-    size: u64,
-}
-
-#[derive(Debug, Serialize, Default)]
-pub struct CompactionStatistics {
-    delta_layer_visited: CompactionStatisticsNumSize,
-    image_layer_visited: CompactionStatisticsNumSize,
-    delta_layer_produced: CompactionStatisticsNumSize,
-    image_layer_produced: CompactionStatisticsNumSize,
-    num_delta_layer_discarded: usize,
-    num_image_layer_discarded: usize,
-    num_unique_keys_visited: usize,
-    wal_keys_visited: CompactionStatisticsNumSize,
-    image_keys_visited: CompactionStatisticsNumSize,
-    wal_produced: CompactionStatisticsNumSize,
-    image_produced: CompactionStatisticsNumSize,
-}
-
-impl CompactionStatistics {
-    fn estimated_size_of_value(val: &Value) -> usize {
-        match val {
-            Value::Image(img) => img.len(),
-            Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
-            _ => std::mem::size_of::<NeonWalRecord>(),
-        }
-    }
-    fn estimated_size_of_key() -> usize {
-        KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
-    }
-    fn visit_delta_layer(&mut self, size: u64) {
-        self.delta_layer_visited.num += 1;
-        self.delta_layer_visited.size += size;
-    }
-    fn visit_image_layer(&mut self, size: u64) {
-        self.image_layer_visited.num += 1;
-        self.image_layer_visited.size += size;
-    }
-    fn on_unique_key_visited(&mut self) {
-        self.num_unique_keys_visited += 1;
-    }
-    fn visit_wal_key(&mut self, val: &Value) {
-        self.wal_keys_visited.num += 1;
-        self.wal_keys_visited.size +=
-            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
-    }
-    fn visit_image_key(&mut self, val: &Value) {
-        self.image_keys_visited.num += 1;
-        self.image_keys_visited.size +=
-            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
-    }
-    fn produce_key(&mut self, val: &Value) {
-        match val {
-            Value::Image(img) => self.produce_image_key(img),
-            Value::WalRecord(_) => self.produce_wal_key(val),
-        }
-    }
-    fn produce_wal_key(&mut self, val: &Value) {
-        self.wal_produced.num += 1;
-        self.wal_produced.size +=
-            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
-    }
-    fn produce_image_key(&mut self, val: &Bytes) {
-        self.image_produced.num += 1;
-        self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
-    }
-    fn discard_delta_layer(&mut self) {
-        self.num_delta_layer_discarded += 1;
-    }
-    fn discard_image_layer(&mut self) {
-        self.num_image_layer_discarded += 1;
-    }
-    fn produce_delta_layer(&mut self, size: u64) {
-        self.delta_layer_produced.num += 1;
-        self.delta_layer_produced.size += size;
-    }
-    fn produce_image_layer(&mut self, size: u64) {
-        self.image_layer_produced.num += 1;
-        self.image_layer_produced.size += size;
-    }
-}
-
 impl Timeline {
    /// TODO: cancellation
    ///
@@ -210,18 +118,12 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<bool, CompactionError> {
        if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
-            self.compact_with_gc(cancel, flags, ctx)
+            self.compact_with_gc(cancel, ctx)
                .await
                .map_err(CompactionError::Other)?;
            return Ok(false);
        }

-        if flags.contains(CompactFlags::DryRun) {
-            return Err(CompactionError::Other(anyhow!(
-                "dry-run mode is not supported for legacy compaction for now"
-            )));
-        }
-
        // High level strategy for compaction / image creation:
        //
        // 1. First, calculate the desired "partitioning" of the
@@ -1104,14 +1006,16 @@ impl Timeline {
                        || contains_hole
                    {
                        // ... if so, flush previous layer and prepare to write new one
-                        new_layers.push(
-                            writer
-                                .take()
-                                .unwrap()
-                                .finish(prev_key.unwrap().next(), self, ctx)
-                                .await
-                                .map_err(CompactionError::Other)?,
-                        );
+                        let (desc, path) = writer
+                            .take()
+                            .unwrap()
+                            .finish(prev_key.unwrap().next(), ctx)
+                            .await
+                            .map_err(CompactionError::Other)?;
+                        let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
+                            .map_err(CompactionError::Other)?;
+
+                        new_layers.push(new_delta);
                        writer = None;

                        if contains_hole {
@@ -1174,12 +1078,13 @@ impl Timeline {
            prev_key = Some(key);
        }
        if let Some(writer) = writer {
-            new_layers.push(
-                writer
-                    .finish(prev_key.unwrap().next(), self, ctx)
-                    .await
-                    .map_err(CompactionError::Other)?,
-            );
+            let (desc, path) = writer
+                .finish(prev_key.unwrap().next(), ctx)
+                .await
+                .map_err(CompactionError::Other)?;
+            let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
+                .map_err(CompactionError::Other)?;
+            new_layers.push(new_delta);
        }

        // Sync layers
@@ -1739,7 +1644,6 @@ impl Timeline {
    pub(crate) async fn compact_with_gc(
        self: &Arc<Self>,
        cancel: &CancellationToken,
-        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        use std::collections::BTreeSet;
@@ -1763,16 +1667,12 @@ impl Timeline {
        )
        .await?;

-        let dry_run = flags.contains(CompactFlags::DryRun);
-
-        info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
+        info!("running enhanced gc bottom-most compaction");

        scopeguard::defer! {
            info!("done enhanced gc bottom-most compaction");
        };

-        let mut stat = CompactionStatistics::default();
-
        // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
        // The layer selection has the following properties:
        // 1. If a layer is in the selection, all layers below it are in the selection.
@@ -1843,9 +1743,6 @@ impl Timeline {
                let key_range = desc.get_key_range();
                delta_split_points.insert(key_range.start);
                delta_split_points.insert(key_range.end);
-                stat.visit_delta_layer(desc.file_size());
-            } else {
-                stat.visit_image_layer(desc.file_size());
            }
        }
        let mut delta_layers = Vec::new();
@@ -1881,8 +1778,6 @@ impl Timeline {
            tline: &Arc<Timeline>,
            lowest_retain_lsn: Lsn,
            ctx: &RequestContext,
-            stats: &mut CompactionStatistics,
-            dry_run: bool,
            last_batch: bool,
        ) -> anyhow::Result<Option<FlushDeltaResult>> {
            // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
@@ -1939,7 +1834,6 @@ impl Timeline {
                    let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
                    drop(guard);
                    if layer_generation == tline.generation {
-                        stats.discard_delta_layer();
                        // TODO: depending on whether we design this compaction process to run along with
                        // other compactions, there could be layer map modifications after we drop the
                        // layer guard, and in case it creates duplicated layer key, we will still error
@@ -1966,13 +1860,11 @@ impl Timeline {
            for (key, lsn, val) in deltas {
                delta_layer_writer.put_value(key, lsn, val, ctx).await?;
            }
-            stats.produce_delta_layer(delta_layer_writer.size());
-            if dry_run {
-                return Ok(None);
-            }
-            let delta_layer = delta_layer_writer
-                .finish(delta_key.key_range.end, tline, ctx)
+
+            let (desc, path) = delta_layer_writer
+                .finish(delta_key.key_range.end, ctx)
                .await?;
+            let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
            Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
        }

@@ -2064,13 +1956,6 @@ impl Timeline {
        let mut current_delta_split_point = 0;
        let mut delta_layers = Vec::new();
        while let Some((key, lsn, val)) = merge_iter.next().await? {
-            if cancel.is_cancelled() {
-                return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
-            }
-            match val {
-                Value::Image(_) => stat.visit_image_key(&val),
-                Value::WalRecord(_) => stat.visit_wal_key(&val),
-            }
            if last_key.is_none() || last_key.as_ref() == Some(&key) {
                if last_key.is_none() {
                    last_key = Some(key);
@@ -2078,7 +1963,6 @@ impl Timeline {
                accumulated_values.push((key, lsn, val));
            } else {
                let last_key = last_key.as_mut().unwrap();
-                stat.on_unique_key_visited();
                let retention = self
                    .generate_key_retention(
                        *last_key,
@@ -2095,7 +1979,6 @@ impl Timeline {
                        *last_key,
                        &mut delta_values,
                        image_layer_writer.as_mut(),
-                        &mut stat,
                        ctx,
                    )
                    .await?;
@@ -2108,8 +1991,6 @@ impl Timeline {
                        self,
                        lowest_retain_lsn,
                        ctx,
-                        &mut stat,
-                        dry_run,
                        false,
                    )
                    .await?,
@@ -2122,7 +2003,6 @@ impl Timeline {

        let last_key = last_key.expect("no keys produced during compaction");
        // TODO: move this part to the loop body
-        stat.on_unique_key_visited();
        let retention = self
            .generate_key_retention(
                last_key,
@@ -2139,7 +2019,6 @@ impl Timeline {
                last_key,
                &mut delta_values,
                image_layer_writer.as_mut(),
-                &mut stat,
                ctx,
            )
            .await?;
@@ -2152,8 +2031,6 @@ impl Timeline {
                self,
                lowest_retain_lsn,
                ctx,
-                &mut stat,
-                dry_run,
                true,
            )
            .await?,
@@ -2161,28 +2038,12 @@ impl Timeline {
        assert!(delta_values.is_empty(), "unprocessed keys");

        let image_layer = if discard_image_layer {
-            stat.discard_image_layer();
            None
        } else if let Some(writer) = image_layer_writer {
-            stat.produce_image_layer(writer.size());
-            if !dry_run {
-                Some(writer.finish(self, ctx).await?)
-            } else {
-                None
-            }
+            Some(writer.finish(self, ctx).await?)
        } else {
            None
        };
-
-        info!(
-            "gc-compaction statistics: {}",
-            serde_json::to_string(&stat)?
-        );
-
-        if dry_run {
-            return Ok(());
-        }
-
        info!(
            "produced {} delta layers and {} image layers",
            delta_layers.len(),
@@ -2206,7 +2067,6 @@ impl Timeline {
        let mut layer_selection = layer_selection;
        layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
        compact_to.extend(image_layer);
-
        // Step 3: Place back to the layer map.
        {
            let mut guard = self.layers.write().await;
@@ -2413,9 +2273,9 @@ impl CompactionJobExecutor for TimelineAdaptor {
            ))
        });

-        let new_delta_layer = writer
-            .finish(prev.unwrap().0.next(), &self.timeline, ctx)
-            .await?;
+        let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?;
+        let new_delta_layer =
+            Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;

        self.new_deltas.push(new_delta_layer);
        Ok(())
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -230,8 +230,6 @@ impl DeleteTimelineFlow {
        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
        timeline.shutdown(super::ShutdownMode::Hard).await;

-        tenant.gc_block.before_delete(&timeline);
-
        fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
            Err(anyhow::anyhow!(
                "failpoint: timeline-delete-before-index-deleted-at"
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -488,10 +488,12 @@ async fn copy_lsn_prefix(
        // reuse the key instead of adding more holes between layers by using the real
        // highest key in the layer.
        let reused_highest_key = layer.layer_desc().key_range.end;
-        let copied = writer
-            .finish(reused_highest_key, target_timeline, ctx)
+        let (desc, path) = writer
+            .finish(reused_highest_key, ctx)
            .await
            .map_err(CopyDeltaPrefix)?;
+        let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
+            .map_err(CopyDeltaPrefix)?;

        tracing::debug!(%layer, %copied, "new layer produced");

--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -122,10 +122,6 @@ impl CurrentLogicalSize {
            Self::Exact(_) => Accuracy::Exact,
        }
    }
-
-    pub(crate) fn is_exact(&self) -> bool {
-        matches!(self, Self::Exact(_))
-    }
 }

 impl LogicalSize {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -27,8 +27,8 @@ use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
    metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
-    task_mgr::TaskKind,
-    task_mgr::WALRECEIVER_RUNTIME,
+    pgdatadir_mapping::DatadirModification,
+    task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
    walingest::WalIngest,
    walrecord::DecodedWALRecord,
@@ -342,7 +342,10 @@ pub(super) async fn handle_walreceiver_connection(
                        // Commit every ingest_batch_size records. Even if we filtered out
                        // all records, we still need to call commit to advance the LSN.
                        uncommitted_records += 1;
-                        if uncommitted_records >= ingest_batch_size {
+                        if uncommitted_records >= ingest_batch_size
+                            || modification.approx_pending_bytes()
+                                > DatadirModification::MAX_PENDING_BYTES
+                        {
                            WAL_INGEST
                                .records_committed
                                .inc_by(uncommitted_records - filtered_records);
--- a/poetry.lock
+++ b/poetry.lock
@@ -1514,20 +1514,6 @@ files = [
 [package.dependencies]
 six = "*"

-[[package]]
-name = "kafka-python"
-version = "2.0.2"
-description = "Pure Python client for Apache Kafka"
-optional = false
-python-versions = "*"
-files = [
-    {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
-    {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
-]
-
-[package.extras]
-crc32c = ["crc32c"]
-
 [[package]]
 name = "lazy-object-proxy"
 version = "1.10.0"
@@ -3371,4 +3357,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72"
+content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,8 @@
 [tool.poetry]
+name = "neon"
+version = "0.1.0"
 description = ""
 authors = []
-package-mode = false

 [tool.poetry.dependencies]
 python = "^3.9"
@@ -41,7 +42,6 @@ httpx = {extras = ["http2"], version = "^0.26.0"}
 pytest-repeat = "^0.9.3"
 websockets = "^12.0"
 clickhouse-connect = "^0.7.16"
-kafka-python = "^2.0.2"

 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
@@ -75,7 +75,6 @@ module = [
    "allure.*",
    "allure_commons.*",
    "allure_pytest.*",
-    "kafka.*",
 ]
 ignore_missing_imports = true

--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -92,7 +92,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                            .push(format!("index_part.json version: {}", index_part.version()))
                    }

-                    let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3);
+                    let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2);
                    if !newest_versions.any(|ip| ip == &index_part.version()) {
                        info!(
                            "index_part.json version is not latest: {}",
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -4,7 +4,7 @@ use anyhow::{anyhow, Context};
 use async_stream::{stream, try_stream};
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
 use futures::StreamExt;
-use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
+use remote_storage::{GenericRemoteStorage, ListingMode};
 use tokio_stream::Stream;

 use crate::{
@@ -276,33 +276,3 @@ pub(crate) fn stream_listing<'a>(
        }
    }
 }
-
-pub(crate) fn stream_listing_generic<'a>(
-    remote_client: &'a GenericRemoteStorage,
-    target: &'a S3Target,
-) -> impl Stream<Item = anyhow::Result<(RemotePath, Option<ListingObject>)>> + 'a {
-    let listing_mode = if target.delimiter.is_empty() {
-        ListingMode::NoDelimiter
-    } else {
-        ListingMode::WithDelimiter
-    };
-    try_stream! {
-        let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
-            remote_client,
-            listing_mode,
-            target,
-        ));
-        while let Some(list) = objects_stream.next().await {
-            let list = list?;
-            if target.delimiter.is_empty() {
-                for key in list.keys {
-                    yield (key.key.clone(), Some(key));
-                }
-            } else {
-                for key in list.prefixes {
-                    yield (key, None);
-                }
-            }
-        }
-    }
-}
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -1,10 +1,10 @@
 use std::{collections::HashSet, str::FromStr, sync::Arc};

+use aws_sdk_s3::Client;
 use futures::stream::{StreamExt, TryStreamExt};
 use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
 use postgres_ffi::{XLogFileName, PG_TLI};
-use remote_storage::GenericRemoteStorage;
 use serde::Serialize;
 use tokio_postgres::types::PgLsn;
 use tracing::{error, info, trace};
@@ -14,9 +14,8 @@ use utils::{
 };

 use crate::{
-    cloud_admin_api::CloudAdminApiClient, init_remote_generic,
-    metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget,
-    TenantShardTimelineId,
+    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
+    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
 };

 /// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
@@ -107,7 +106,7 @@ pub async fn scan_safekeeper_metadata(
    let timelines = client.query(&query, &[]).await?;
    info!("loaded {} timelines", timelines.len());

-    let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?;
+    let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?;
    let console_config = ConsoleConfig::from_env()?;
    let cloud_admin_api_client = CloudAdminApiClient::new(console_config);

@@ -120,7 +119,7 @@ pub async fn scan_safekeeper_metadata(
        let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
        check_timeline(
-            &remote_client,
+            &s3_client,
            &target,
            &cloud_admin_api_client,
            ttid,
@@ -157,7 +156,7 @@ struct TimelineCheckResult {
 /// errors are logged to stderr; returns Ok(true) if timeline is consistent,
 /// Ok(false) if not, Err if failed to check.
 async fn check_timeline(
-    remote_client: &GenericRemoteStorage,
+    s3_client: &Client,
    root: &RootTarget,
    api_client: &CloudAdminApiClient,
    ttid: TenantTimelineId,
@@ -188,13 +187,12 @@ async fn check_timeline(
    // we need files, so unset it.
    timeline_dir_target.delimiter = String::new();

-    let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
+    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
    while let Some(obj) = stream.next().await {
-        let (key, _obj) = obj?;
+        let obj = obj?;
+        let key = obj.key();

        let seg_name = key
-            .get_path()
-            .as_str()
            .strip_prefix(&timeline_dir_target.prefix_in_bucket)
            .expect("failed to extract segment name");
        expected_segfiles.remove(seg_name);
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -285,9 +285,9 @@ class NeonApiEndpoint:
            self.project_id = project_id
            eps = neon_api.get_endpoints(project_id)["endpoints"]
            self.endpoint_id = eps[0]["id"]
-            self.connstr = neon_api.get_connection_uri(
-                project_id, endpoint_id=self.endpoint_id, pooled=False
-            )["uri"]
+            self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[
+                "uri"
+            ]
            pw = self.connstr.split("@")[0].split(":")[-1]
            self.pgbench_env = {
                "PGHOST": eps[0]["host"],
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -556,22 +556,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        assert isinstance(res_json, dict)
        return res_json

-    def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId):
-        res = self.post(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc",
-        )
-        log.info(f"Got GC request response code: {res.status_code}")
-        self.verbose_error(res)
-
-    def timeline_unblock_gc(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
-    ):
-        res = self.post(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc",
-        )
-        log.info(f"Got GC request response code: {res.status_code}")
-        self.verbose_error(res)
-
    def timeline_compact(
        self,
        tenant_id: Union[TenantId, TenantShardId],
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -389,10 +389,7 @@ WaitUntilRet = TypeVar("WaitUntilRet")


 def wait_until(
-    number_of_iterations: int,
-    interval: float,
-    func: Callable[[], WaitUntilRet],
-    show_intermediate_error=False,
+    number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet]
 ) -> WaitUntilRet:
    """
    Wait until 'func' returns successfully, without exception. Returns the
@@ -405,8 +402,6 @@ def wait_until(
        except Exception as e:
            log.info("waiting for %s iteration %s failed", func, i + 1)
            last_exception = e
-            if show_intermediate_error:
-                log.info(e)
            time.sleep(interval)
            continue
        return res
--- a/test_runner/logical_repl/README.md
+++ b/test_runner/logical_repl/README.md
@@ -1,22 +0,0 @@
-# Logical replication tests
-
-## Clickhouse
-
-```bash
-export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
-
-docker compose -f clickhouse/docker-compose.yml up -d
-pytest -m remote_cluster -k test_clickhouse
-docker compose -f clickhouse/docker-compose.yml down
-```
-
-## Debezium
-
-```bash
-export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
-
-docker compose -f debezium/docker-compose.yml up -d
-pytest -m remote_cluster -k test_debezium
-docker compose -f debezium/docker-compose.yml down
-
-```
--- a/test_runner/logical_repl/clickhouse/docker-compose.yml
+++ b/test_runner/logical_repl/clickhouse/docker-compose.yml
@@ -1,9 +0,0 @@
-services:
-  clickhouse:
-    image: clickhouse/clickhouse-server
-    user: "101:101"
-    container_name: clickhouse
-    hostname: clickhouse
-    ports:
-      - 127.0.0.1:8123:8123
-      - 127.0.0.1:9000:9000
--- a/test_runner/logical_repl/debezium/docker-compose.yml
+++ b/test_runner/logical_repl/debezium/docker-compose.yml
@@ -1,24 +0,0 @@
-services:
-  zookeeper:
-    image: quay.io/debezium/zookeeper:2.7
-  kafka:
-    image: quay.io/debezium/kafka:2.7
-    environment:
-      ZOOKEEPER_CONNECT: "zookeeper:2181"
-      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
-      KAFKA_BROKER_ID: 1
-      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
-      KAFKA_JMX_PORT: 9991
-    ports:
-      - 127.0.0.1:9092:9092
-  debezium:
-    image: quay.io/debezium/connect:2.7
-    environment:
-      BOOTSTRAP_SERVERS: kafka:9092
-      GROUP_ID: 1
-      CONFIG_STORAGE_TOPIC: debezium-config
-      OFFSET_STORAGE_TOPIC: debezium-offset
-      STATUS_STORAGE_TOPIC: debezium-status
-      DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
-    ports:
-      - 127.0.0.1:8083:8083
--- a/test_runner/logical_repl/test_debezium.py
+++ b/test_runner/logical_repl/test_debezium.py
@@ -1,189 +0,0 @@
-"""
-Test the logical replication in Neon with Debezium as a consumer
-"""
-
-import json
-import os
-import time
-
-import psycopg2
-import pytest
-import requests
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import RemotePostgres
-from fixtures.utils import wait_until
-from kafka import KafkaConsumer
-
-
-class DebeziumAPI:
-    """
-    The class for Debezium API calls
-    """
-
-    def __init__(self):
-        self.__host = "debezium" if ("CI" in os.environ) else "127.0.0.1"
-        self.__base_url = f"http://{self.__host}:8083"
-        self.__connectors_url = f"{self.__base_url}/connectors"
-
-    def __request(self, method, addurl="", **kwargs):
-        return requests.request(
-            method,
-            self.__connectors_url + addurl,
-            headers={"Accept": "application/json", "Content-type": "application/json"},
-            timeout=60,
-            **kwargs,
-        )
-
-    def create_pg_connector(self, remote_pg: RemotePostgres, dbz_conn_name: str):
-        """
-        Create a Postgres connector in debezium
-        """
-        conn_options = remote_pg.conn_options()
-        payload = {
-            "name": dbz_conn_name,
-            "config": {
-                "connector.class": "io.debezium.connector.postgresql.PostgresConnector",
-                "tasks.max": "1",
-                "database.hostname": conn_options["host"],
-                "database.port": "5432",
-                "database.user": conn_options["user"],
-                "database.password": conn_options["password"],
-                "database.dbname": conn_options["dbname"],
-                "plugin.name": "pgoutput",
-                "topic.prefix": "dbserver1",
-                "schema.include.list": "inventory",
-            },
-        }
-        return self.__request("POST", json=payload)
-
-    def list_connectors(self):
-        """
-        Returns a list of all connectors existent in Debezium.
-        """
-        resp = self.__request("GET")
-        assert resp.ok
-        return json.loads(resp.text)
-
-    def del_connector(self, connector):
-        """
-        Deletes the specified connector
-        """
-        return self.__request("DELETE", f"/{connector}")
-
-
-@pytest.fixture(scope="function")
-def debezium(remote_pg: RemotePostgres):
-    """
-    Prepare the Debezium API handler, connection
-    """
-    conn = psycopg2.connect(remote_pg.connstr())
-    cur = conn.cursor()
-    cur.execute("DROP SCHEMA IF EXISTS inventory CASCADE")
-    cur.execute("CREATE SCHEMA inventory")
-    cur.execute(
-        "CREATE TABLE inventory.customers ("
-        "id SERIAL NOT NULL PRIMARY KEY,"
-        "first_name character varying(255) NOT NULL,"
-        "last_name character varying(255) NOT NULL,"
-        "email character varying(255) NOT NULL)"
-    )
-    conn.commit()
-    dbz = DebeziumAPI()
-    assert len(dbz.list_connectors()) == 0
-    dbz_conn_name = "inventory-connector"
-    resp = dbz.create_pg_connector(remote_pg, dbz_conn_name)
-    log.debug("%s %s %s", resp.status_code, resp.ok, resp.text)
-    assert resp.status_code == 201
-    assert len(dbz.list_connectors()) == 1
-    consumer = KafkaConsumer(
-        "dbserver1.inventory.customers",
-        bootstrap_servers=["kafka:9092"],
-        auto_offset_reset="earliest",
-        enable_auto_commit=False,
-    )
-    yield conn, consumer
-    resp = dbz.del_connector(dbz_conn_name)
-    assert resp.status_code == 204
-
-
-def get_kafka_msg(consumer, ts_ms, before=None, after=None) -> None:
-    """
-    Gets the message from Kafka and checks its validity
-    Arguments:
-        consumer: the consumer object
-        ts_ms:    timestamp in milliseconds of the change of db, the corresponding message must have
-                  the later timestamp
-        before:   a dictionary, if not None, the before field from the kafka message must
-                  have the same values for the same keys
-        after:    a dictionary, if not None, the after field from the kafka message must
-                  have the same values for the same keys
-    """
-    msg = consumer.poll()
-    assert msg, "Empty message"
-    for val in msg.values():
-        r = json.loads(val[-1].value)
-        log.info(r["payload"])
-        assert ts_ms < r["payload"]["ts_ms"], "Incorrect timestamp"
-        for param, pname in ((before, "before"), (after, "after")):
-            if param is not None:
-                for k, v in param.items():
-                    assert r["payload"][pname][k] == v, f"{pname} mismatches"
-
-
-@pytest.mark.remote_cluster
-def test_debezium(debezium):
-    """
-    Test the logical replication having Debezium as a subscriber
-    """
-    conn, consumer = debezium
-    cur = conn.cursor()
-    ts_ms = time.time() * 1000
-    log.info("Insert 1 ts_ms: %s", ts_ms)
-    cur.execute(
-        "insert into inventory.customers (first_name, last_name, email) "
-        "values ('John', 'Dow','johndow@example.com')"
-    )
-    conn.commit()
-    wait_until(
-        100,
-        0.5,
-        lambda: get_kafka_msg(
-            consumer,
-            ts_ms,
-            after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"},
-        ),
-        show_intermediate_error=True,
-    )
-    ts_ms = time.time() * 1000
-    log.info("Insert 2 ts_ms: %s", ts_ms)
-    cur.execute(
-        "insert into inventory.customers (first_name, last_name, email) "
-        "values ('Alex', 'Row','alexrow@example.com')"
-    )
-    conn.commit()
-    wait_until(
-        100,
-        0.5,
-        lambda: get_kafka_msg(
-            consumer,
-            ts_ms,
-            after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"},
-        ),
-        show_intermediate_error=True,
-    )
-    ts_ms = time.time() * 1000
-    log.info("Update ts_ms: %s", ts_ms)
-    cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2")
-    conn.commit()
-    wait_until(
-        100,
-        0.5,
-        lambda: get_kafka_msg(
-            consumer,
-            ts_ms,
-            after={"first_name": "Alexander"},
-        ),
-        show_intermediate_error=True,
-    )
-    time.sleep(3)
-    cur.execute("select 1")
--- a/test_runner/logical_repl/test_clickhouse.py
+++ b/test_runner/logical_repl/test_clickhouse.py
@@ -1,9 +1,8 @@
 """
-Test the logical replication in Neon with ClickHouse as a consumer
+Test the logical replication in Neon with the different consumers
 """

 import hashlib
-import os
 import time

 import clickhouse_connect
@@ -40,15 +39,22 @@ def test_clickhouse(remote_pg: RemotePostgres):
    """
    Test the logical replication having ClickHouse as a client
    """
-    clickhouse_host = "clickhouse" if ("CI" in os.environ) else "127.0.0.1"
    conn_options = remote_pg.conn_options()
-    conn = psycopg2.connect(remote_pg.connstr())
+    for _ in range(5):  # up to 5 connection attempts, 1 second apart
+        try:
+            conn = psycopg2.connect(remote_pg.connstr())
+            break  # connected; skips the for-else below
+        except psycopg2.OperationalError as perr:
+            log.debug(perr)
+            time.sleep(1)
+    else:  # loop finished without break: every attempt failed
+        raise TimeoutError
    cur = conn.cursor()
    cur.execute("DROP TABLE IF EXISTS table1")
    cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
    cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
    conn.commit()
-    client = clickhouse_connect.get_client(host=clickhouse_host)
+    client = clickhouse_connect.get_client(host="clickhouse")
    client.command("SET allow_experimental_database_materialized_postgresql=1")
    client.command(
        "CREATE DATABASE db1_postgres ENGINE = "
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -1,6 +1,5 @@
 from contextlib import closing

-import pytest
 from fixtures.benchmark_fixture import MetricReport
 from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare, PgCompare
@@ -17,7 +16,6 @@ from fixtures.pg_version import PgVersion
 # 3. Disk space used
 # 4. Peak memory usage
 #
-@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124")
 def test_bulk_insert(neon_with_baseline: PgCompare):
    env = neon_with_baseline

--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -100,32 +100,24 @@ def test_subscriber_lag(
    pub_connstr = benchmark_project_pub.connstr
    sub_connstr = benchmark_project_sub.connstr

-    if benchmark_project_pub.is_new:
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
-    if benchmark_project_sub.is_new:
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
+    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)

    pub_conn = psycopg2.connect(pub_connstr)
    sub_conn = psycopg2.connect(sub_connstr)
    pub_conn.autocommit = True
    sub_conn.autocommit = True
    with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
-        pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
-        pub_exists = len(pub_cur.fetchall()) != 0
+        if benchmark_project_pub.is_new:
+            pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")

-        if not pub_exists:
-            pub_cur.execute("CREATE PUBLICATION pub1 FOR TABLE pgbench_accounts, pgbench_history")
-
-        sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'")
-        sub_exists = len(sub_cur.fetchall()) != 0
-        if not sub_exists:
+        if benchmark_project_sub.is_new:
            sub_cur.execute("truncate table pgbench_accounts")
            sub_cur.execute("truncate table pgbench_history")

-            sub_cur.execute(f"CREATE SUBSCRIPTION sub1 CONNECTION '{pub_connstr}' PUBLICATION pub1")
+            sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")

        initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
-
    pub_conn.close()
    sub_conn.close()

@@ -203,15 +195,10 @@ def test_publisher_restart(
    pub_conn.autocommit = True
    sub_conn.autocommit = True
    with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
-        pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
-        pub_exists = len(pub_cur.fetchall()) != 0
-
-        if not pub_exists:
+        if benchmark_project_pub.is_new:
            pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")

-        sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'")
-        sub_exists = len(sub_cur.fetchall()) != 0
-        if not sub_exists:
+        if benchmark_project_sub.is_new:
            sub_cur.execute("truncate table pgbench_accounts")
            sub_cur.execute("truncate table pgbench_history")

--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -1,67 +0,0 @@
-import time
-
-from fixtures.neon_fixtures import (
-    NeonEnvBuilder,
-)
-from fixtures.pageserver.utils import wait_timeline_detail_404
-
-
-def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start(
-        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}
-    )
-    ps = env.pageserver
-    http = ps.http_client()
-
-    foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant)
-
-    gc_active_line = ".* gc_loop.*: [12] timelines need GC"
-    gc_skipped_line = ".* gc_loop.*: Skipping GC: .*"
-    init_gc_skipped = ".*: initialized with gc blocked.*"
-
-    tenant_before = http.tenant_status(env.initial_tenant)
-
-    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line)
-
-    assert ps.log_contains(gc_skipped_line, offset) is None
-
-    http.timeline_block_gc(env.initial_tenant, foo_branch)
-
-    tenant_after = http.tenant_status(env.initial_tenant)
-    assert tenant_before != tenant_after
-    gc_blocking = tenant_after["gc_blocking"]
-    assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"
-
-    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
-
-    ps.restart()
-    ps.quiesce_tenants()
-
-    _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset)
-
-    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
-
-    # deletion unblocks gc
-    http.timeline_delete(env.initial_tenant, foo_branch)
-    wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0)
-
-    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line, offset)
-
-    http.timeline_block_gc(env.initial_tenant, env.initial_timeline)
-
-    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
-
-    # removing the manual block also unblocks gc
-    http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline)
-
-    wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line, offset)
-
-
-def wait_for_another_gc_round():
-    time.sleep(2)
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -936,9 +936,6 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    # just make sure this doesn't hit an assertion
-    client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True)
-
    # load in some data
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    endpoint.safe_psql_many(
Author	SHA1	Message	Date
John Spray	5664eadb17	dirty	2024-08-05 18:39:26 +00:00
John Spray	1a99aa4834	utils: use SmallVec in VecMap	2024-08-05 18:22:41 +00:00
John Spray	35df89dcfe	update ingest_bench	2024-08-05 17:57:41 +00:00
John Spray	28280683ab	enable bulk_ingest	2024-08-05 17:49:02 +00:00
John Spray	eed100b21e	wip	2024-08-05 17:49:02 +00:00
John Spray	4bd26c54e5	wip	2024-08-05 17:49:02 +00:00
John Spray	e461a711d0	Soft limit on the size of monolithic serialization stage	2024-08-05 17:49:02 +00:00
John Spray	93dacd75aa	Refactor InMemoryLayer put_batch code	2024-08-05 17:49:02 +00:00
John Spray	06428e856e	Remove unused singular puts	2024-08-05 17:49:01 +00:00
John Spray	fce68fe84e	pageserver: avoid a no-longer-needed sort during ingest	2024-08-05 17:47:58 +00:00
John Spray	fd60904376	pageserver: batch ephemeral layer writes during ingest	2024-08-05 17:47:58 +00:00
John Spray	bf3e767b35	update split_writer for merge	2024-08-05 17:43:08 +00:00
John Spray	513cafd72b	Merge remote-tracking branch 'upstream/main' into jcsp/ingest-bench	2024-08-05 17:38:10 +00:00
John Spray	c2d5395a00	clean up temp dir	2024-08-05 12:36:15 +00:00
John Spray	d152a57c29	s/field3/field6/	2024-08-05 12:23:15 +00:00
John Spray	a8be0f3376	add a doc comment	2024-08-05 12:18:06 +00:00
John Spray	5dcfe1c4b8	pageserver: downgrade an assertion to debug	2024-08-01 15:43:21 +00:00
John Spray	ae7d635098	pageserver: add ingest bench	2024-08-01 15:43:21 +00:00
John Spray	137cbb4db4	pageserver: refactor DeltaLayerWriter to not need a Timeline	2024-08-01 15:43:21 +00:00
John Spray	74eda0b0b7	pageserver: make bench'able methods public	2024-08-01 15:00:01 +00:00