Compare commits

..

20 Commits

Author SHA1 Message Date
Alex Chi Z
933bb88694 Revert "refactor(page_service): Timeline gate guard holding + cancellation + shutdown (#8339)"
This reverts commit 4e3b70e308.
2024-08-07 19:57:33 +08:00
Arpad Müller
00c981576a Lower level for timeline cancellations during gc (#8626)
Timeline cancellation running in parallel with gc yields error log lines
like:

```
Gc failed 1 times, retrying in 2s: TimelineCancelled
```

However, they are completely harmless and normal to occur. Therefore, print
those messages only at the info level. We still print them at all so that we
can tell what is going on when focusing on a single timeline.
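
A minimal sketch of the intended behavior, with assumed error and helper names
(the real code paths differ):

```
// Hedged sketch: cancellation during gc is expected and harmless, so it is
// logged at info; other gc failures keep the error level.
#[derive(Debug)]
enum GcError {
    TimelineCancelled,
    Other(String),
}

fn log_gc_retry(err: &GcError, attempts: u32, backoff_secs: u64) {
    match err {
        GcError::TimelineCancelled => {
            tracing::info!("Gc failed {attempts} times, retrying in {backoff_secs}s: {err:?}")
        }
        _ => tracing::error!("Gc failed {attempts} times, retrying in {backoff_secs}s: {err:?}"),
    }
}
```
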
2024-08-07 09:29:52 +02:00
Arpad Müller
c3f2240fbd storage broker: only print one line for version and build tag in init (#8624)
This makes it more consistent with the pageserver and safekeeper. It also
makes it easier to collect the two values into one data point.
2024-08-07 09:14:26 +02:00
Yuchen Liang
ed5724d79d scrubber: clean up scan_metadata before prod (#8565)
Part of #8128.

## Problem
Currently, the scrubber `scan_metadata` command returns an error
code if the metadata on remote storage is corrupted with fatal errors.
To safely deploy this command in a cronjob, we want to differentiate
between failures while running the scrubber command and erroneous
metadata. At the same time, we also want our regression tests to catch
corrupted metadata using the scrubber command.

## Summary of changes

- Return an error code only when the scrubber command itself fails (see the
sketch below)
- Use explicit checks on errors and warnings to determine metadata
health in regression tests.
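
A sketch of the exit-code policy; `ScanSummary` and its fields here are
illustrative stand-ins, not the scrubber's actual types:

```
struct ScanSummary {
    errors: usize,
    warnings: usize,
}

// Corrupt metadata is reported in the summary but no longer fails the command;
// only an operational failure of the scrubber itself yields a non-zero code.
fn exit_code(result: Result<ScanSummary, String>) -> i32 {
    match result {
        Ok(summary) => {
            // regression tests assert on these counts explicitly instead
            eprintln!("errors: {}, warnings: {}", summary.errors, summary.warnings);
            0
        }
        Err(e) => {
            eprintln!("scan_metadata failed: {e}");
            1
        }
    }
}
```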

**Resolve conflict with `tenant-snapshot` command (after shard split):**
[`test_scrubber_tenant_snapshot`](https://github.com/neondatabase/neon/blob/yuchen/scrubber-scan-cleanup-before-prod/test_runner/regress/test_storage_scrubber.py#L23)
failed before applying 422a8443dd
- When taking a snapshot, the old `index_part.json` in the unsharded
tenant directory is not kept.
- The current `list_timeline_blobs` implementation considers a missing
`index_part.json` a parse error.
- During the scan, we only analyze shards with the highest shard count, so we
will not get a parse error, but we do need to add the layers to the tenant
object listing; otherwise we get an "index is referencing a layer that is not
in remote storage" error.
- **Action:** Add s3_layers from `list_timeline_blobs` regardless of parsing
errors

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2024-08-06 18:55:42 +01:00
John Spray
ca5390a89d pageserver: add bench_ingest (#7409)
## Problem

We lack a Rust bench for the inmemory layer and delta layer write paths:
it is useful to benchmark these components independently of postgres & WAL
decoding.

Related: https://github.com/neondatabase/neon/issues/8452

## Summary of changes

- Refactor DeltaLayerWriter to avoid carrying a Timeline, so that it can
be cleanly tested + benched without a Tenant/Timeline test harness. It
only needed the Timeline for building `Layer`, so this can be done in a
separate step.
- Add `bench_ingest`, which exercises a variety of workload "shapes"
(big values, small values, sequential keys, random keys)
- Include a small uncontroversial optimization: in `freeze`, only
exhaustively walk values to assert ordering relative to end_lsn in debug
mode.

These benches are limited by drive performance on a lot of machines, but
still useful as a local tool for iterating on CPU/memory improvements
around this code path.
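
The output below comes from criterion. As a rough sketch under stated
assumptions (a `BTreeMap` stands in for the inmemory layer; this is not the
actual `bench_ingest`), a throughput bench of this shape looks like:

```
use criterion::{criterion_group, criterion_main, Criterion, Throughput};

const TOTAL_BYTES: u64 = 128 * 1024 * 1024;

fn bench_ingest(c: &mut Criterion) {
    let mut group = c.benchmark_group("ingest-small-values");
    group.throughput(Throughput::Bytes(TOTAL_BYTES)); // reports MiB/s
    group.sample_size(10);
    group.bench_function("ingest 128MB/100b seq", |b| {
        b.iter(|| {
            // stand-in for ingestion: sequential keys, 100-byte values
            let value = vec![0u8; 100];
            let mut map = std::collections::BTreeMap::new();
            for key in 0..(TOTAL_BYTES / 100) {
                map.insert(key, value.clone());
            }
            map
        })
    });
    group.finish();
}

criterion_group!(benches, bench_ingest);
criterion_main!(benches);
```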

Anecdotal measurements on Hetzner AX102 (Ryzen 9 7950X3D):

```

ingest-small-values/ingest 128MB/100b seq
                        time:   [1.1160 s 1.1230 s 1.1289 s]
                        thrpt:  [113.38 MiB/s 113.98 MiB/s 114.70 MiB/s]
Found 1 outliers among 10 measurements (10.00%)
  1 (10.00%) low mild
Benchmarking ingest-small-values/ingest 128MB/100b rand: Warming up for 3.0000 s
Warning: Unable to complete 10 samples in 10.0s. You may wish to increase target time to 18.9s.
ingest-small-values/ingest 128MB/100b rand
                        time:   [1.9001 s 1.9056 s 1.9110 s]
                        thrpt:  [66.982 MiB/s 67.171 MiB/s 67.365 MiB/s]
Benchmarking ingest-small-values/ingest 128MB/100b rand-1024keys: Warming up for 3.0000 s
Warning: Unable to complete 10 samples in 10.0s. You may wish to increase target time to 11.0s.
ingest-small-values/ingest 128MB/100b rand-1024keys
                        time:   [1.0715 s 1.0828 s 1.0937 s]
                        thrpt:  [117.04 MiB/s 118.21 MiB/s 119.46 MiB/s]
ingest-small-values/ingest 128MB/100b seq, no delta
                        time:   [425.49 ms 429.07 ms 432.04 ms]
                        thrpt:  [296.27 MiB/s 298.32 MiB/s 300.83 MiB/s]
Found 1 outliers among 10 measurements (10.00%)
  1 (10.00%) low mild

ingest-big-values/ingest 128MB/8k seq
                        time:   [373.03 ms 375.84 ms 379.17 ms]
                        thrpt:  [337.58 MiB/s 340.57 MiB/s 343.13 MiB/s]
Found 1 outliers among 10 measurements (10.00%)
  1 (10.00%) high mild
ingest-big-values/ingest 128MB/8k seq, no delta
                        time:   [81.534 ms 82.811 ms 83.364 ms]
                        thrpt:  [1.4994 GiB/s 1.5095 GiB/s 1.5331 GiB/s]
Found 1 outliers among 10 measurements (10.00%)


```
2024-08-06 16:39:40 +00:00
John Spray
3727c6fbbe pageserver: use layer visibility when composing heatmap (#8616)
## Problem

Sometimes, a layer is Covered but hasn't yet been evicted from local disk
(e.g. shortly after image layer generation). It is not a good use of
resources to download these to a secondary location, as there's a good
chance they will never be read.

This follows the previous change that added layer visibility:
- #8511 

Part of epic:
- https://github.com/neondatabase/neon/issues/8398

## Summary of changes

- When generating heatmaps, only include Visible layers (see the sketch after
this list)
- Update test_secondary_downloads to filter to visible layers when
listing layers from an attached location
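
A minimal sketch of the filtering idea with simplified types;
`LayerVisibilityHint` mirrors the visibility hint from #8511, but the real
heatmap code differs:

```
#[derive(Clone, Copy, PartialEq)]
enum LayerVisibilityHint {
    Visible,
    Covered,
}

struct HeatMapCandidate {
    name: &'static str,
    visibility: LayerVisibilityHint,
}

// Covered layers are skipped so secondary locations don't download
// data that will likely never be read.
fn heatmap_layers(layers: &[HeatMapCandidate]) -> Vec<&HeatMapCandidate> {
    layers
        .iter()
        .filter(|l| l.visibility == LayerVisibilityHint::Visible)
        .collect()
}
```
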
2024-08-06 17:15:40 +01:00
John Spray
42229aacf6 pageserver: fixes for layer visibility metric (#8603)
## Problem

In staging, we could see that occasionally tenants were wrapping their
pageserver_visible_physical_size metric past zero to 2^64.

This is harmless right now, but will matter more later when we start
using visible size in things like the /utilization endpoint.

## Summary of changes

- Add debug asserts that detect this case. `test_gc_of_remote_layers`
works as a reproducer for this issue once the asserts are added.
- Tighten up the interface around access_stats so that only Layer can
mutate it.
- In Layer, wrap calls to `record_access` in code that updates the visible
size statistic if the access implicitly marks the layer visible (this was what
caused the bug; see the sketch after this list)
- In LayerManager::rewrite_layers, use the proper set_visibility layer
function instead of directly using access_stats (this is an additional
path where metrics could go bad.)
- Removed unused instances of LayerAccessStats in DeltaLayer and
ImageLayer which I noticed while reviewing the code paths that call
record_access.
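
A sketch of the invariant the new debug asserts protect, assuming a plain
atomic gauge (the real metric type differs):

```
use std::sync::atomic::{AtomicU64, Ordering};

// Subtract a layer's size from the visible-size gauge only if it was actually
// counted; the debug assert catches the underflow that made the unsigned
// metric wrap past zero to 2^64.
fn on_layer_covered(visible_physical_size: &AtomicU64, layer_size: u64) {
    let prev = visible_physical_size.fetch_sub(layer_size, Ordering::Relaxed);
    debug_assert!(
        prev >= layer_size,
        "visible size underflow: {prev} < {layer_size}"
    );
}
```
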
2024-08-06 14:47:01 +01:00
John Spray
b7beaa0fd7 tests: improve stability of test_storage_controller_many_tenants (#8607)
## Problem

The controller scale test does random migrations. These mutate secondary
locations, and therefore can cause secondary optimizations to happen in
the background, violating the test's expectation that consistency_check
will work as there are no reconciliations running.

Example:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/10247161379/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/6316beacd3fb3060/

## Summary of changes

- Only migrate to existing secondary locations, not randomly picked
nodes, so that we can do a fast reconcile_until_idle (otherwise
reconcile_until_idle takes a long time to create new secondary
locations).
- Do a reconcile_until_idle before consistency_check.
2024-08-06 12:58:33 +01:00
a-masterov
16c91ff5d3 enable rum test (#8380)
## Problem
We need to test the rum extension automatically as part of the GitHub
workflow.

## Summary of changes

The rum test is enabled.
2024-08-06 13:56:42 +02:00
a-masterov
078f941dc8 Add a test using Debezium as a client for the logical replication (#8568)
## Problem
We need to test logical replication with external consumers.
## Summary of changes
A test of logical replication with Debezium as a consumer was added.
---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2024-08-06 13:08:55 +02:00
Arseny Sher
68bcbf8227 Add package-mode=false to poetry.
We don't use it for packaging, and 'poetry install' will soon error
otherwise. Also remove name and version fields as these are not required for
non-packaging mode.
2024-08-06 13:53:23 +03:00
Arpad Müller
a31c95cb40 storage_scrubber: migrate scan_safekeeper_metadata to remote_storage (#8595)
Migrates the safekeeper-specific parts of `ScanMetadata` to
GenericRemoteStorage, making it Azure-ready.
 
Part of https://github.com/neondatabase/neon/issues/7547
2024-08-06 10:51:39 +00:00
Joonas Koivunen
dc7eb5ae5a chore: bump index part version (#8611)
#8600 missed the hunk bumping the informative index_part.json version.
Include it in this PR; in addition, add more non-warning index_part.json
versions to the scrubber.
2024-08-06 11:45:41 +01:00
Vlad Lazar
44fedfd6c3 pageserver: remove legacy read path (#8601)
## Problem

We have been maintaining two read paths (legacy and vectored) for a
while now. The legacy read path was only used for cross-validation in some tests.

## Summary of changes
* Tweak all tests that were using the legacy read path to use the
vectored read path instead
* Remove the read path dispatching based on the pageserver configs
* Remove the legacy read path code

We will be able to remove the single blob io code in
`pageserver/src/tenant/blob_io.rs` when https://github.com/neondatabase/neon/issues/7386 is complete.

Closes https://github.com/neondatabase/neon/issues/8005
2024-08-06 10:14:01 +01:00
Joonas Koivunen
138f008bab feat: persistent gc blocking (#8600)
Currently, we do not have facilities to persistently block GC on a
tenant for whatever reason. We could do a tenant configuration update,
but that is risky for generation numbers and would also be transient.
Introduce a `gc_block` facility in the tenant, which manages per-timeline
blocking reasons.

Additionally, add HTTP endpoints for enabling/disabling manual gc
blocking for a specific timeline. For debugging, individual tenant
status now includes a similar string representation logged when GC is
skipped.
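
A simplified, self-contained sketch of the synchronization, condensed from the
new `tenant/gc_block.rs` later in this comparison:

```
use std::collections::HashMap;
use std::sync::Mutex;

#[derive(Default)]
struct GcBlock {
    reasons: Mutex<HashMap<u64, Vec<&'static str>>>, // timeline id -> reasons
    blocking: tokio::sync::Mutex<()>, // held while a gc iteration runs
}

impl GcBlock {
    // GC must obtain this guard before starting an iteration; if the tenant is
    // blocked, the reasons are returned instead. Inserting a block awaits this
    // same mutex, so an in-flight gc iteration finishes before the block takes
    // effect.
    async fn start(&self) -> Result<tokio::sync::MutexGuard<'_, ()>, String> {
        let blocked: Vec<&str> = {
            let g = self.reasons.lock().unwrap();
            g.values().flatten().copied().collect()
        };
        if blocked.is_empty() {
            Ok(self.blocking.lock().await)
        } else {
            Err(format!("blocked by {blocked:?}"))
        }
    }
}
```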

Cc: #6994
2024-08-06 10:09:56 +01:00
Joonas Koivunen
6a6f30e378 fix: make Timeline::set_disk_consistent_lsn use fetch_max (#8311)
Now it is safe to use from multiple callers, as we have two callers.
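
A sketch of why `fetch_max` makes this safe, with `AtomicU64` standing in for
the LSN wrapper:

```
use std::sync::atomic::{AtomicU64, Ordering};

// With fetch_max the update is monotonic: two racing callers can never move
// disk_consistent_lsn backwards, whichever order their updates land in.
fn set_disk_consistent_lsn(disk_consistent_lsn: &AtomicU64, candidate: u64) {
    disk_consistent_lsn.fetch_max(candidate, Ordering::AcqRel);
}
```
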
2024-08-06 08:52:01 +01:00
Alex Chi Z.
8f3bc5ae35 feat(pageserver): support dry-run for gc-compaction, add statistics (#8557)
Add a dry-run mode that does not produce any image or delta layers. I
will use this code to do some experiments and see how much space we can
reclaim for tenants on staging. Part of
https://github.com/neondatabase/neon/issues/8002

* Add a dry-run mode that runs the full compaction process without updating
the layer map (we never call finish on the writers, and the files are removed
before exiting the function); see the sketch after this list.
* Add compaction statistics and print them at the end of compaction.
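
A sketch of the flag plumbing with `enumset` (the placeholder variant is
illustrative), matching the `compact_with_gc(&cancel, dryrun_flags, &ctx)`
call sites in the test diff later in this comparison:

```
use enumset::{EnumSet, EnumSetType};

#[derive(EnumSetType, Debug)]
enum CompactFlags {
    ForceRepartition, // placeholder for other flags
    DryRun,
}

fn is_dry_run(flags: EnumSet<CompactFlags>) -> bool {
    flags.contains(CompactFlags::DryRun)
}

fn main() {
    let mut dryrun_flags = EnumSet::new();
    dryrun_flags.insert(CompactFlags::DryRun);
    assert!(is_dry_run(dryrun_flags));
}
```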

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-06 02:07:48 +00:00
Alexander Bayandin
e6e578821b CI(benchmarking): set pub/sub projects for LR tests (#8483)
## Problem

> Currently, long-running LR tests recreate endpoints every night. We'd
like to have a long-running buildup of history to exercise the pageserver
in this case (instead of "unit-testing" the same behavior every night).

Closes #8317

## Summary of changes
- Update Postgres version for replication tests
- Set `BENCHMARK_PROJECT_ID_PUB`/`BENCHMARK_PROJECT_ID_SUB` env vars to
projects that were created for this purpose

---------

Co-authored-by: Sasha Krassovsky <krassovskysasha@gmail.com>
2024-08-05 22:06:47 +00:00
Joonas Koivunen
c32807ac19 fix: allow awaiting logical size for root timelines (#8604)
Currently, if `GET
/v1/tenant/x/timeline/y?force-await-initial-logical-size=true` is
requested for a root timeline created within the current pageserver
session, the request handler panics, hitting the debug assertion. These
timelines always have an accurate logical size, calculated at initdb
import. The fix is to never attempt to prioritize timeline size
calculation if we already have an exact value.
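
A sketch of the fix's shape; the variants are simplified from the pageserver's
current-logical-size type:

```
enum CurrentLogicalSize {
    Exact(u64),
    Approximate(u64),
}

// Whether to queue the timeline for prioritized size calculation: an exact
// value means there is nothing to await, which is where the debug assertion
// used to fire for root timelines created in this pageserver session.
fn should_prioritize(size: &CurrentLogicalSize) -> bool {
    matches!(size, CurrentLogicalSize::Approximate(_))
}
```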

Split off from #8528.
2024-08-05 21:21:33 +01:00
Alexander Bayandin
50daff9655 CI(trigger-e2e-tests): fix deadlock with Build and Test workflow (#8606)
## Problem

In some cases, a deadlock between `build-and-test` and
`trigger-e2e-tests` workflows can happen:

```
Build and Test

Canceling since a deadlock for concurrency group 'Build and Test-8600/merge-anysha' was detected between 'top level workflow' and 'trigger-e2e-tests'
```

I don't understand the reason completely; probably `${{ github.workflow
}}` got evaluated to the same value and somehow caused the issue.
We don't need to limit concurrency for the `trigger-e2e-tests`
workflow.

See
https://neondb.slack.com/archives/C059ZC138NR/p1722869486708179?thread_ts=1722869027.960029&cid=C059ZC138NR
2024-08-05 19:47:59 +01:00
69 changed files with 2274 additions and 2788 deletions


@@ -8,6 +8,8 @@ self-hosted-runner:
- small-arm64
- us-east-2
config-variables:
- BENCHMARK_PROJECT_ID_PUB
- BENCHMARK_PROJECT_ID_SUB
- REMOTE_STORAGE_AZURE_CONTAINER
- REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID


@@ -147,7 +147,7 @@ jobs:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -168,7 +168,7 @@ jobs:
path: /tmp/neon/
prefix: latest
- name: Run benchmark
- name: Run Logical Replication benchmarks
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
@@ -176,12 +176,15 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}
- name: Run benchmark
- name: Run Physical Replication benchmarks
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}


@@ -66,7 +66,31 @@ jobs:
ports:
- 9000:9000
- 8123:8123
zookeeper:
image: quay.io/debezium/zookeeper:2.7
ports:
- 2181:2181
kafka:
image: quay.io/debezium/kafka:2.7
env:
ZOOKEEPER_CONNECT: "zookeeper:2181"
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
KAFKA_BROKER_ID: 1
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_JMX_PORT: 9991
ports:
- 9092:9092
debezium:
image: quay.io/debezium/connect:2.7
env:
BOOTSTRAP_SERVERS: kafka:9092
GROUP_ID: 1
CONFIG_STORAGE_TOPIC: debezium-config
OFFSET_STORAGE_TOPIC: debezium-offset
STATUS_STORAGE_TOPIC: debezium-status
DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
ports:
- 8083:8083
steps:
- uses: actions/checkout@v4


@@ -10,10 +10,6 @@ defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true
env:
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}

Cargo.lock generated

@@ -6762,7 +6762,6 @@ dependencies = [
"serde_path_to_error",
"serde_with",
"signal-hook",
"smallvec",
"strum",
"strum_macros",
"thiserror",


@@ -933,7 +933,8 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
#COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY patches/rum.patch /ext-src
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -945,7 +946,7 @@ COPY patches/pg_hintplan.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
@@ -960,6 +961,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|| exit 1; rm -f $f; done
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
# cmake is required for the h3 test
RUN apt-get update && apt-get install -y cmake
RUN patch -p1 < /ext-src/pg_hintplan.patch


@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
rm -rf $TMPDIR
# We are running tests now
if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
then
cleanup


@@ -1,15 +1,15 @@
#!/bin/bash
set -x
cd /ext-src
cd /ext-src || exit 2
FAILED=
LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
for d in ${LIST}
do
[ -d ${d} ] || continue
[ -d "${d}" ] || continue
psql -c "select 1" >/dev/null || break
make -C ${d} installcheck || FAILED="${d} ${FAILED}"
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
done
[ -z "${FAILED}" ] && exit 0
echo ${FAILED}
echo "${FAILED}"
exit 1


@@ -637,6 +637,13 @@ pub struct TenantInfo {
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
pub attachment_status: TenantAttachmentStatus,
pub generation: u32,
/// Opaque explanation if gc is being blocked.
///
/// Only looked up for the individual tenant detail, not the listing. This is purely for
/// debugging, not included in openapi.
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_blocking: Option<String>,
}
#[derive(Serialize, Deserialize, Clone)]
@@ -1427,6 +1434,7 @@ mod tests {
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: 1,
gc_blocking: None,
};
let expected_active = json!({
"id": original_active.id.to_string(),
@@ -1449,6 +1457,7 @@ mod tests {
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: 1,
gc_blocking: None,
};
let expected_broken = json!({
"id": original_broken.id.to_string(),


@@ -35,7 +35,6 @@ routerify.workspace = true
serde.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
smallvec.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-tar.workspace = true


@@ -1,15 +1,11 @@
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
use smallvec::SmallVec;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum VecMapOrdering {
Greater,
GreaterOrEqual,
}
const INLINE_ELEMENTS: usize = 1;
/// Ordered map datastructure implemented in a Vec.
/// Append only - can only add keys that are larger than the
/// current max key.
@@ -17,7 +13,7 @@ const INLINE_ELEMENTS: usize = 1;
/// during `VecMap` construction.
#[derive(Clone, Debug)]
pub struct VecMap<K, V> {
data: SmallVec<[(K, V); INLINE_ELEMENTS]>,
data: Vec<(K, V)>,
ordering: VecMapOrdering,
}
@@ -41,14 +37,14 @@ pub enum VecMapError {
impl<K: Ord, V> VecMap<K, V> {
pub fn new(ordering: VecMapOrdering) -> Self {
Self {
data: Default::default(),
data: Vec::new(),
ordering,
}
}
pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
Self {
data: SmallVec::with_capacity(capacity),
data: Vec::with_capacity(capacity),
ordering,
}
}
@@ -99,10 +95,6 @@ impl<K: Ord, V> VecMap<K, V> {
Ok(delta_size)
}
pub fn append_fast(&mut self, key: K, value: V) {
self.data.push((key, value))
}
/// Update the maximum key value pair or add a new key value pair to the map.
/// If `key` is not respective of the `self` ordering no updates or additions
/// will occur and `InvalidKey` error will be returned.
@@ -143,11 +135,11 @@ impl<K: Ord, V> VecMap<K, V> {
(
VecMap {
data: SmallVec::from(&self.data[..split_idx]),
data: self.data[..split_idx].to_vec(),
ordering: self.ordering,
},
VecMap {
data: SmallVec::from(&self.data[split_idx..]),
data: self.data[split_idx..].to_vec(),
ordering: self.ordering,
},
)
@@ -194,10 +186,7 @@ impl<K: Ord, V> VecMap<K, V> {
/// Instrument an operation on the underlying [`Vec`].
/// Will panic if the operation decreases capacity.
/// Returns the increase in memory usage caused by the op.
fn instrument_vec_op(
&mut self,
op: impl FnOnce(&mut SmallVec<[(K, V); INLINE_ELEMENTS]>),
) -> usize {
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
let old_cap = self.data.capacity();
op(&mut self.data);
let new_cap = self.data.capacity();
@@ -237,7 +226,7 @@ impl<K: Ord, V> VecMap<K, V> {
impl<K: Ord, V> IntoIterator for VecMap<K, V> {
type Item = (K, V);
type IntoIter = smallvec::IntoIter<[(K, V); INLINE_ELEMENTS]>;
type IntoIter = std::vec::IntoIter<(K, V)>;
fn into_iter(self) -> Self::IntoIter {
self.data.into_iter()


@@ -10,7 +10,7 @@ use pageserver::{
page_cache,
repository::Value,
task_mgr::TaskKind,
tenant::storage_layer::{InMemoryLayer, SerializedBatch},
tenant::storage_layer::InMemoryLayer,
virtual_file::{self, api::IoEngineKind},
};
use pageserver_api::{key::Key, shard::TenantShardId};
@@ -63,15 +63,12 @@ async fn ingest(
let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &ctx).await?;
let value = Value::Image(Bytes::from(vec![0u8; put_size]));
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
let ctx = RequestContext::new(
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
pageserver::context::DownloadBehavior::Download,
);
let batch_pages = 10000;
let mut batch_values = vec![];
for i in 0..put_count {
lsn += put_size as u64;
@@ -94,19 +91,7 @@ async fn ingest(
}
}
batch_values.push((key, lsn, value.clone()));
if batch_values.len() >= batch_pages {
let write_batch = std::mem::take(&mut batch_values);
let batch = SerializedBatch::from_values(write_batch);
layer.put_batch(&batch, &ctx).await?;
}
}
if !batch_values.is_empty() {
let batch = SerializedBatch::from_values(vec![(key, lsn, value.clone())]);
layer.put_batch(&batch, &ctx).await?;
layer.put_value(key, lsn, &data, &ctx).await?;
}
layer.freeze(lsn + 1).await;


@@ -17,9 +17,11 @@ use pageserver::config::PageserverIdentity;
use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
use pageserver::{
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
};
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
@@ -29,9 +31,11 @@ use tracing::*;
use metrics::set_build_info_metric;
use pageserver::{
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
deletion_queue::DeletionQueue,
http, page_cache, page_service, task_mgr,
task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
tenant::mgr,
virtual_file,
};
@@ -590,13 +594,30 @@ fn start_pageserver(
// Spawn a task to listen for libpq connections. It will spawn further tasks
// for each connection. We created the listener earlier already.
let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
pageserver_listener
.set_nonblocking(true)
.context("set listener to nonblocking")?;
tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
});
let libpq_listener = {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
TaskKind::LibpqEndpointListener,
// listener task shouldn't need to download anything. (We will
// create a separate sub-contexts for each connection, with their
// own download behavior. This context is used only to listen and
// accept connections.)
DownloadBehavior::Error,
);
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"libpq listener",
page_service::libpq_listener_main(
tenant_manager.clone(),
pg_auth,
pageserver_listener,
conf.pg_auth_type,
libpq_ctx,
cancel.clone(),
),
));
LibpqEndpointListener(CancellableTask { task, cancel })
};
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
@@ -624,7 +645,7 @@ fn start_pageserver(
shutdown_pageserver.take();
pageserver::shutdown_pageserver(
http_endpoint_listener,
page_service,
libpq_listener,
consumption_metrics_tasks,
disk_usage_eviction_task,
&tenant_manager,


@@ -308,6 +308,45 @@ paths:
application/json:
schema:
type: string
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Persistently add a gc blocking at the tenant level because of this timeline
responses:
"200":
description: OK
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Persistently remove a tenant level gc blocking for this timeline
responses:
"200":
description: OK
/v1/tenant/{tenant_shard_id}/location_config:
parameters:
- name: tenant_shard_id


@@ -296,11 +296,6 @@ impl From<GetActiveTenantError> for ApiError {
GetActiveTenantError::WaitForActiveTimeout { .. } => {
ApiError::ResourceUnavailable(format!("{}", e).into())
}
GetActiveTenantError::SwitchedTenant => {
// in our HTTP handlers, this error doesn't happen
// TODO: separate error types
ApiError::ResourceUnavailable("switched tenant".into())
}
}
}
}
@@ -935,6 +930,7 @@ async fn tenant_list_handler(
generation: (*gen)
.into()
.expect("Tenants are always attached with a generation"),
gc_blocking: None,
})
.collect::<Vec<TenantInfo>>();
@@ -986,6 +982,7 @@ async fn tenant_status(
.generation()
.into()
.expect("Tenants are always attached with a generation"),
gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
},
walredo: tenant.wal_redo_manager_status(),
timelines: tenant.list_timeline_ids(),
@@ -1226,6 +1223,72 @@ async fn evict_timeline_layer_handler(
}
}
async fn timeline_gc_blocking_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
block_or_unblock_gc(request, true).await
}
async fn timeline_gc_unblocking_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
block_or_unblock_gc(request, false).await
}
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
///
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
async fn block_or_unblock_gc(
request: Request<Body>,
block: bool,
) -> Result<Response<Body>, ApiError> {
use crate::tenant::{
remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
};
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant.get_timeline(timeline_id, true)?;
let fut = async {
if block {
timeline.block_gc(&tenant).await.map(|_| ())
} else {
timeline.unblock_gc(&tenant).await
}
};
let span = tracing::info_span!(
"block_or_unblock_gc",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
timeline_id = %timeline_id,
block = block,
);
let res = fut.instrument(span).await;
res.map_err(|e| {
if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
ApiError::ShuttingDown
} else {
ApiError::InternalServerError(e)
}
})?;
json_response(StatusCode::OK, ())
}
/// Get tenant_size SVG graph along with the JSON data.
fn synthetic_size_html_response(
inputs: ModelInputs,
@@ -2904,6 +2967,14 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, evict_timeline_layer_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
|r| api_handler(r, timeline_gc_blocking_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|r| api_handler(r, timeline_gc_unblocking_handler),
)
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
api_handler(r, secondary_upload_handler)
})


@@ -32,6 +32,7 @@ pub mod walingest;
pub mod walrecord;
pub mod walredo;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
use tenant::{
@@ -64,6 +65,7 @@ pub struct CancellableTask {
pub cancel: CancellationToken,
}
pub struct HttpEndpointListener(pub CancellableTask);
pub struct LibpqEndpointListener(pub CancellableTask);
pub struct ConsumptionMetricsTasks(pub CancellableTask);
pub struct DiskUsageEvictionTask(pub CancellableTask);
impl CancellableTask {
@@ -77,7 +79,7 @@ impl CancellableTask {
#[allow(clippy::too_many_arguments)]
pub async fn shutdown_pageserver(
http_listener: HttpEndpointListener,
page_service: page_service::Listener,
libpq_listener: LibpqEndpointListener,
consumption_metrics_worker: ConsumptionMetricsTasks,
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
tenant_manager: &TenantManager,
@@ -162,8 +164,8 @@ pub async fn shutdown_pageserver(
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
let remaining_connections = timed(
page_service.stop_accepting(),
timed(
libpq_listener.0.shutdown(),
"shutdown LibpqEndpointListener",
Duration::from_secs(1),
)
@@ -181,7 +183,7 @@ pub async fn shutdown_pageserver(
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
// should already have been canclled via mgr::shutdown_all_tenants
timed(
remaining_connections.shutdown(),
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)

File diff suppressed because it is too large


@@ -15,6 +15,7 @@ use crate::{aux_file, repository::*};
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
@@ -36,6 +37,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{debug, info, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
@@ -172,7 +174,6 @@ impl Timeline {
pending_deletions: Vec::new(),
pending_nblocks: 0,
pending_directory_entries: Vec::new(),
pending_bytes: 0,
lsn,
}
}
@@ -1057,26 +1058,14 @@ pub struct DatadirModification<'a> {
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, usize)>,
/// An **approximation** of how large our EphemeralFile write will be when committed.
pending_bytes: usize,
}
impl<'a> DatadirModification<'a> {
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
// additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
/// Get the current lsn
pub(crate) fn get_lsn(&self) -> Lsn {
self.lsn
}
pub(crate) fn approx_pending_bytes(&self) -> usize {
self.pending_bytes
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
@@ -1804,12 +1793,11 @@ impl<'a> DatadirModification<'a> {
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
for (key, values) in self.pending_updates.drain() {
let mut write_batch = Vec::new();
for (lsn, value) in values {
if key.is_rel_block_key() || key.is_slru_block_key() {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
write_batch.push((key, lsn, value));
writer.put(key, lsn, &value, ctx).await?;
} else {
retained_pending_updates
.entry(key)
@@ -1817,11 +1805,9 @@ impl<'a> DatadirModification<'a> {
.push((lsn, value));
}
}
writer.put_batch(write_batch, ctx).await?;
}
self.pending_updates = retained_pending_updates;
self.pending_bytes = 0;
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1847,20 +1833,17 @@ impl<'a> DatadirModification<'a> {
self.pending_nblocks = 0;
if !self.pending_updates.is_empty() {
// Ordering: the items in this batch do not need to be in any global order, but values for
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
// this to do efficient updates to its index.
let batch: Vec<(Key, Lsn, Value)> = self
.pending_updates
.drain()
.flat_map(|(key, values)| {
values
.into_iter()
.map(move |(lsn, value)| (key, lsn, value))
})
.collect::<Vec<_>>();
// The put_batch call below expects expects the inputs to be sorted by Lsn,
// so we do that first.
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
self.pending_updates
.drain()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
VecMapOrdering::GreaterOrEqual,
);
writer.put_batch(batch, ctx).await?;
writer.put_batch(lsn_ordered_batch, ctx).await?;
}
if !self.pending_deletions.is_empty() {
@@ -1885,8 +1868,6 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
self.pending_bytes = 0;
Ok(())
}
@@ -1937,10 +1918,6 @@ impl<'a> DatadirModification<'a> {
return;
}
}
self.pending_bytes += match &val {
Value::Image(inner) => inner.len(),
Value::WalRecord(_) => 100, // Rough approximation of typical serialized WalRecord size.
};
values.push((self.lsn, val));
}


@@ -148,6 +148,7 @@ pub(crate) mod timeline;
pub mod size;
mod gc_block;
pub(crate) mod throttle;
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -303,6 +304,12 @@ pub struct Tenant {
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
/// `index_part.json` based gc blocking reason tracking.
///
/// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
/// proceeding.
pub(crate) gc_block: gc_block::GcBlock,
l0_flush_global_state: L0FlushGlobalState,
}
@@ -440,8 +447,6 @@ impl WalRedoManager {
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum GetTimelineError {
#[error("Timeline is shutting down")]
ShuttingDown,
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
NotActive {
tenant_id: TenantShardId,
@@ -1036,6 +1041,8 @@ impl Tenant {
}
}
let mut gc_blocks = HashMap::new();
// For every timeline, download the metadata file, scan the local directory,
// and build a layer map that contains an entry for each remote and local
// layer file.
@@ -1045,6 +1052,16 @@ impl Tenant {
.remove(&timeline_id)
.expect("just put it in above");
if let Some(blocking) = index_part.gc_blocking.as_ref() {
// could just filter these away, but it helps while testing
anyhow::ensure!(
!blocking.reasons.is_empty(),
"index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
);
let prev = gc_blocks.insert(timeline_id, blocking.reasons);
assert!(prev.is_none());
}
// TODO again handle early failure
self.load_remote_timeline(
timeline_id,
@@ -1089,6 +1106,8 @@ impl Tenant {
// IndexPart is the source of truth.
self.clean_up_timelines(&existent_timelines)?;
self.gc_block.set_scanned(gc_blocks);
fail::fail_point!("attach-before-activate", |_| {
anyhow::bail!("attach-before-activate");
});
@@ -1679,6 +1698,14 @@ impl Tenant {
}
}
let _guard = match self.gc_block.start().await {
Ok(guard) => guard,
Err(reasons) => {
info!("Skipping GC: {reasons}");
return Ok(GcResult::default());
}
};
self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await
}
@@ -2691,6 +2718,7 @@ impl Tenant {
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
gc_block: Default::default(),
l0_flush_global_state,
}
}
@@ -4092,7 +4120,7 @@ pub(crate) mod harness {
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::collections::{BTreeMap, BTreeSet};
use super::*;
use crate::keyspace::KeySpaceAccum;
@@ -4767,7 +4795,7 @@ mod tests {
lsn: Lsn,
repeat: usize,
key_count: usize,
) -> anyhow::Result<()> {
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
let compact = true;
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
}
@@ -4780,7 +4808,9 @@ mod tests {
repeat: usize,
key_count: usize,
compact: bool,
) -> anyhow::Result<()> {
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0;
@@ -4801,6 +4831,7 @@ mod tests {
ctx,
)
.await?;
inserted.entry(test_key).or_default().insert(lsn);
writer.finish_write(lsn);
drop(writer);
@@ -4825,7 +4856,7 @@ mod tests {
assert_eq!(res.layers_removed, 0, "this never removes anything");
}
Ok(())
Ok(inserted)
}
//
@@ -4872,7 +4903,7 @@ mod tests {
.await?;
let lsn = Lsn(0x10);
bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
let guard = tline.layers.read().await;
guard.layer_map().dump(true, &ctx).await?;
@@ -4933,9 +4964,39 @@ mod tests {
&ctx,
)
.await;
tline
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
.await;
let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
let mut expect_missing = false;
let mut key = read.start().unwrap();
while key != read.end().unwrap() {
if let Some(lsns) = inserted.get(&key) {
let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
match expected_lsn {
Some(lsn) => {
expected_lsns.insert(key, *lsn);
}
None => {
expect_missing = true;
break;
}
}
} else {
expect_missing = true;
break;
}
key = key.next();
}
if expect_missing {
assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
} else {
for (key, image) in vectored_res? {
let expected_lsn = expected_lsns.get(&key).expect("determined above");
let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
assert_eq!(image?, expected_image);
}
}
}
Ok(())
@@ -4985,10 +5046,6 @@ mod tests {
)
.await;
child_timeline
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
.await;
let images = vectored_res?;
assert!(images.is_empty());
Ok(())
@@ -6899,7 +6956,10 @@ mod tests {
}
let cancel = CancellationToken::new();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
for (idx, expected) in expected_result.iter().enumerate() {
assert_eq!(
@@ -6993,7 +7053,10 @@ mod tests {
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
Ok(())
}
@@ -7327,7 +7390,10 @@ mod tests {
}
let cancel = CancellationToken::new();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
for idx in 0..10 {
assert_eq!(
@@ -7353,7 +7419,10 @@ mod tests {
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
Ok(())
}
@@ -7898,11 +7967,28 @@ mod tests {
verify_result().await;
let cancel = CancellationToken::new();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
let mut dryrun_flags = EnumSet::new();
dryrun_flags.insert(CompactFlags::DryRun);
tline
.compact_with_gc(&cancel, dryrun_flags, &ctx)
.await
.unwrap();
// We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
verify_result().await;
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
// compact again
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
// increase GC horizon and compact again
@@ -7912,11 +7998,17 @@ mod tests {
guard.cutoffs.time = Lsn(0x38);
guard.cutoffs.space = Lsn(0x38);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
// not increasing the GC horizon and compact again
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
Ok(())
@@ -8097,7 +8189,10 @@ mod tests {
verify_result().await;
let cancel = CancellationToken::new();
branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
branch_tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;


@@ -79,8 +79,6 @@ impl EphemeralFile {
self.rw.read_blk(blknum, ctx).await
}
#[cfg(test)]
// This is a test helper: outside of tests, we are always written do via a pre-serialized batch.
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
@@ -88,28 +86,17 @@ impl EphemeralFile {
) -> Result<u64, io::Error> {
let pos = self.rw.bytes_written();
let mut len_bytes = std::io::Cursor::new(Vec::new());
crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
srcbuf.len(),
&mut len_bytes,
);
let len_bytes = len_bytes.into_inner();
// Write the length field
self.rw.write_all_borrowed(&len_bytes, ctx).await?;
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
// Write the payload
self.rw.write_all_borrowed(srcbuf, ctx).await?;
Ok(pos)
}
pub(crate) async fn write_raw(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> Result<u64, io::Error> {
let pos = self.rw.bytes_written();
self.rw.write_all_borrowed(&len_buf, ctx).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
self.rw.write_all_borrowed(&len_buf, ctx).await?;
}
// Write the payload
self.rw.write_all_borrowed(srcbuf, ctx).await?;


@@ -0,0 +1,213 @@
use std::collections::HashMap;
use utils::id::TimelineId;
use super::remote_timeline_client::index::GcBlockingReason;
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
#[derive(Default)]
pub(crate) struct GcBlock {
/// The timelines which have current reasons to block gc.
///
/// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
/// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
reasons: std::sync::Mutex<Storage>,
blocking: tokio::sync::Mutex<()>,
}
impl GcBlock {
/// Start another gc iteration.
///
/// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
/// it's ending, or if not currently possible, a value describing the reasons why not.
///
/// Cancellation safe.
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
let reasons = {
let g = self.reasons.lock().unwrap();
// TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
// tests, we use everything. we should warn if the gc has been consecutively blocked
// for more than 1h (within single tenant session?).
BlockingReasons::clean_and_summarize(g)
};
if let Some(reasons) = reasons {
Err(reasons)
} else {
Ok(Guard {
_inner: self.blocking.lock().await,
})
}
}
pub(crate) fn summary(&self) -> Option<BlockingReasons> {
let g = self.reasons.lock().unwrap();
BlockingReasons::summarize(&g)
}
/// Start blocking gc for this one timeline for the given reason.
///
/// This is not a guard based API but instead it mimics set API. The returned future will not
/// resolve until an existing gc round has completed.
///
/// Returns true if this block was new, false if gc was already blocked for this reason.
///
/// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will
/// keep the gc blocking reason.
pub(crate) async fn insert(
&self,
timeline: &super::Timeline,
reason: GcBlockingReason,
) -> anyhow::Result<bool> {
let (added, uploaded) = {
let mut g = self.reasons.lock().unwrap();
let set = g.entry(timeline.timeline_id).or_default();
let added = set.insert(reason);
// LOCK ORDER: intentionally hold the lock, see self.reasons.
let uploaded = timeline
.remote_client
.schedule_insert_gc_block_reason(reason)?;
(added, uploaded)
};
uploaded.await?;
// ensure that any ongoing gc iteration has completed
drop(self.blocking.lock().await);
Ok(added)
}
/// Remove blocking gc for this one timeline and the given reason.
pub(crate) async fn remove(
&self,
timeline: &super::Timeline,
reason: GcBlockingReason,
) -> anyhow::Result<()> {
use std::collections::hash_map::Entry;
super::span::debug_assert_current_span_has_tenant_and_timeline_id();
let (remaining_blocks, uploaded) = {
let mut g = self.reasons.lock().unwrap();
match g.entry(timeline.timeline_id) {
Entry::Occupied(mut oe) => {
let set = oe.get_mut();
set.remove(reason);
if set.is_empty() {
oe.remove();
}
}
Entry::Vacant(_) => {
// we must still do the index_part.json update regardless, in case we had earlier
// been cancelled
}
}
let remaining_blocks = g.len();
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
let uploaded = timeline
.remote_client
.schedule_remove_gc_block_reason(reason)?;
(remaining_blocks, uploaded)
};
uploaded.await?;
// no need to synchronize with gc iteration again
if remaining_blocks > 0 {
tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
} else {
tracing::info!("gc is now unblocked for the tenant");
}
Ok(())
}
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
let unblocked = {
let mut g = self.reasons.lock().unwrap();
if g.is_empty() {
return;
}
g.remove(&timeline.timeline_id);
BlockingReasons::clean_and_summarize(g).is_none()
};
if unblocked {
tracing::info!("gc is now unblocked following deletion");
}
}
/// Initialize with the non-deleted timelines of this tenant.
pub(crate) fn set_scanned(&self, scanned: Storage) {
let mut g = self.reasons.lock().unwrap();
assert!(g.is_empty());
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
tracing::info!(summary=?reasons, "initialized with gc blocked");
}
}
}
pub(super) struct Guard<'a> {
_inner: tokio::sync::MutexGuard<'a, ()>,
}
#[derive(Debug)]
pub(crate) struct BlockingReasons {
timelines: usize,
reasons: enumset::EnumSet<GcBlockingReason>,
}
impl std::fmt::Display for BlockingReasons {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{} timelines block for {:?}",
self.timelines, self.reasons
)
}
}
impl BlockingReasons {
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
let mut reasons = enumset::EnumSet::empty();
g.retain(|_key, value| {
reasons = reasons.union(*value);
!value.is_empty()
});
if !g.is_empty() {
Some(BlockingReasons {
timelines: g.len(),
reasons,
})
} else {
None
}
}
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
if g.is_empty() {
None
} else {
let reasons = g
.values()
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
Some(BlockingReasons {
timelines: g.len(),
reasons,
})
}
}
}


@@ -116,6 +116,8 @@ pub(crate) enum ShardSelector {
/// Only return the 0th shard, if it is present. If a non-0th shard is present,
/// ignore it.
Zero,
/// Pick the first shard we find for the TenantId
First,
/// Pick the shard that holds this key
Page(Key),
/// The shard ID is known: pick the given shard
@@ -2092,6 +2094,7 @@ impl TenantManager {
};
match selector {
ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return ShardResolveResult::Found(tenant.clone())
}
@@ -2173,9 +2176,6 @@ pub(crate) enum GetActiveTenantError {
/// never happen.
#[error("Tenant is broken: {0}")]
Broken(String),
#[error("reconnect to switch tenant id")]
SwitchedTenant,
}
#[derive(Debug, thiserror::Error)]


@@ -800,6 +800,123 @@ impl RemoteTimelineClient {
.context("wait completion")
}
/// Adds a gc blocking reason for this timeline if one does not exist already.
///
/// A retryable step of timeline detach ancestor.
///
/// Returns a future which waits until the completion of the upload.
pub(crate) fn schedule_insert_gc_block_reason(
self: &Arc<Self>,
reason: index::GcBlockingReason,
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
{
let maybe_barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let index::GcBlockingReason::DetachAncestor = reason {
if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
drop(guard);
panic!("cannot start detach ancestor if there is nothing to detach from");
}
}
let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
let current = upload_queue.dirty.gc_blocking.as_ref();
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
match (current, uploaded) {
(x, y) if wanted(x) && wanted(y) => None,
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
// Usual case: !wanted(x) && !wanted(y)
//
// Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
// turn on and off some reason.
(x, y) => {
if !wanted(x) && wanted(y) {
// this could be avoided by having external in-memory synchronization, like
// timeline detach ancestor
warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
}
// at this point, the metadata must always show that there is a parent
upload_queue.dirty.gc_blocking = current
.map(|x| x.with_reason(reason))
.or_else(|| Some(index::GcBlocking::started_now_for(reason)));
self.schedule_index_upload(upload_queue)?;
Some(self.schedule_barrier0(upload_queue))
}
}
};
Ok(async move {
if let Some(barrier) = maybe_barrier {
Self::wait_completion0(barrier).await?;
}
Ok(())
})
}
/// Removes a gc blocking reason for this timeline if one exists.
///
/// A retryable step of timeline detach ancestor.
///
/// Returns a future which waits until the completion of the upload.
pub(crate) fn schedule_remove_gc_block_reason(
self: &Arc<Self>,
reason: index::GcBlockingReason,
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
{
let maybe_barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let index::GcBlockingReason::DetachAncestor = reason {
if !upload_queue
.clean
.0
.lineage
.is_detached_from_original_ancestor()
{
drop(guard);
panic!("cannot complete timeline_ancestor_detach while not detached");
}
}
let wanted = |x: Option<&index::GcBlocking>| {
x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
};
let current = upload_queue.dirty.gc_blocking.as_ref();
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
match (current, uploaded) {
(x, y) if wanted(x) && wanted(y) => None,
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
(x, y) => {
if !wanted(x) && wanted(y) {
warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
}
upload_queue.dirty.gc_blocking =
current.as_ref().and_then(|x| x.without_reason(reason));
assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
// FIXME: bogus ?
self.schedule_index_upload(upload_queue)?;
Some(self.schedule_barrier0(upload_queue))
}
}
};
Ok(async move {
if let Some(barrier) = maybe_barrier {
Self::wait_completion0(barrier).await?;
}
Ok(())
})
}
/// Launch an upload operation in the background; the file is added to be included in next
/// `index_part.json` upload.
pub(crate) fn schedule_layer_file_upload(


@@ -60,6 +60,9 @@ pub struct IndexPart {
#[serde(default)]
pub(crate) lineage: Lineage,
#[serde(skip_serializing_if = "Option::is_none", default)]
pub(crate) gc_blocking: Option<GcBlocking>,
/// Describes the kind of aux files stored in the timeline.
///
/// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
@@ -85,10 +88,11 @@ impl IndexPart {
/// - 6: last_aux_file_policy is added.
/// - 7: metadata_bytes is no longer written, but still read
/// - 8: added `archived_at`
const LATEST_VERSION: usize = 8;
/// - 9: +gc_blocking
const LATEST_VERSION: usize = 9;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
pub const FILE_NAME: &'static str = "index_part.json";
@@ -101,6 +105,7 @@ impl IndexPart {
deleted_at: None,
archived_at: None,
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: None,
}
}
@@ -251,6 +256,64 @@ impl Lineage {
}
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct GcBlocking {
pub(crate) started_at: NaiveDateTime,
pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
}
#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
#[enumset(serialize_repr = "list")]
pub(crate) enum GcBlockingReason {
Manual,
DetachAncestor,
}
impl GcBlocking {
pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
GcBlocking {
started_at: chrono::Utc::now().naive_utc(),
reasons: enumset::EnumSet::only(reason),
}
}
/// Returns true if the given reason is one of the reasons why the gc is blocked.
pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
self.reasons.contains(reason)
}
/// Returns a version of self with the given reason added.
pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
assert!(!self.blocked_by(reason));
let mut reasons = self.reasons;
reasons.insert(reason);
Self {
started_at: self.started_at,
reasons,
}
}
/// Returns a version of self without the given reason. If no reasons remain, returns
/// `None`, on the assumption that gc can then be unblocked.
pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
assert!(self.blocked_by(reason));
if self.reasons.len() == 1 {
None
} else {
let mut reasons = self.reasons;
assert!(reasons.remove(reason));
assert!(!reasons.is_empty());
Some(Self {
started_at: self.started_at,
reasons,
})
}
}
}
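The reason set is an `enumset::EnumSet`, so blocking reasons compose. A minimal sketch of the intended state transitions, using only the API above:

```
#[test]
fn gc_blocking_reasons_compose() {
    let b = GcBlocking::started_now_for(GcBlockingReason::Manual);
    assert!(b.blocked_by(GcBlockingReason::Manual));

    // Adding a second reason preserves the original started_at.
    let b = b.with_reason(GcBlockingReason::DetachAncestor);
    assert!(b.blocked_by(GcBlockingReason::DetachAncestor));

    // Reasons are removed one at a time; the last removal yields None,
    // which is the signal that gc may run again.
    let b = b.without_reason(GcBlockingReason::Manual).expect("one reason remains");
    assert!(b.without_reason(GcBlockingReason::DetachAncestor).is_none());
}
```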
#[cfg(test)]
mod tests {
use super::*;
@@ -292,6 +355,7 @@ mod tests {
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -335,6 +399,7 @@ mod tests {
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -379,6 +444,7 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -426,6 +492,7 @@ mod tests {
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -468,6 +535,7 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -513,6 +581,7 @@ mod tests {
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -563,6 +632,7 @@ mod tests {
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
gc_blocking: None,
last_aux_file_policy: Some(AuxFilePolicy::V2),
};
@@ -618,6 +688,7 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: Default::default(),
};
@@ -674,6 +745,7 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: Default::default(),
};
@@ -681,6 +753,68 @@ mod tests {
assert_eq!(part, expected);
}
#[test]
fn v9_indexpart_is_parsed() {
let example = r#"{
"version": 9,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata": {
"disk_consistent_lsn": "0/16960E8",
"prev_record_lsn": "0/1696070",
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
"ancestor_lsn": "0/0",
"latest_gc_cutoff_lsn": "0/1696070",
"initdb_lsn": "0/1696070",
"pg_version": 14
},
"gc_blocking": {
"started_at": "2024-07-19T09:00:00.123",
"reasons": ["DetachAncestor"]
}
}"#;
let expected = IndexPart {
version: 9,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::new(
Lsn::from_str("0/16960E8").unwrap(),
Some(Lsn::from_str("0/1696070").unwrap()),
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
).with_recalculated_checksum().unwrap(),
deleted_at: None,
lineage: Default::default(),
gc_blocking: Some(GcBlocking {
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
}),
last_aux_file_policy: Default::default(),
archived_at: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
}

View File

@@ -29,7 +29,6 @@ use utils::lsn::Lsn;
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub use inmemory_layer::SerializedBatch;
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
@@ -436,21 +435,6 @@ impl ReadableLayer {
}
}
/// Return value from [`Layer::get_value_reconstruct_data`]
#[derive(Clone, Copy, Debug)]
pub enum ValueReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue,
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing,
}
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
@@ -555,19 +539,25 @@ impl LayerAccessStats {
self.record_residence_event_at(SystemTime::now())
}
pub(crate) fn record_access_at(&self, now: SystemTime) {
fn record_access_at(&self, now: SystemTime) -> bool {
let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
// A layer which is accessed must be visible.
mask |= 0x1 << Self::VISIBILITY_SHIFT;
value |= 0x1 << Self::VISIBILITY_SHIFT;
self.write_bits(mask, value);
let old_bits = self.write_bits(mask, value);
!matches!(
self.decode_visibility(old_bits),
LayerVisibilityHint::Visible
)
}
pub(crate) fn record_access(&self, ctx: &RequestContext) {
/// Returns true if we modified the layer's visibility to set it to Visible implicitly
/// as a result of this access
pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
return;
return false;
}
self.record_access_at(SystemTime::now())
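`write_bits` packs the low-resolution access time and the visibility bit into a single atomic word (the `layer_size` test further down pins `LayerAccessStats` at 8 bytes). A sketch of the mask/value update style, with an invented field layout rather than the real encoding:

```
use std::sync::atomic::{AtomicU64, Ordering};

struct PackedStats(AtomicU64);

impl PackedStats {
    /// Set the bits selected by `mask` to `value`; return the previous word
    /// so the caller can inspect e.g. the old visibility bit.
    fn write_bits(&self, mask: u64, value: u64) -> u64 {
        let mut old = self.0.load(Ordering::Relaxed);
        loop {
            let new = (old & !mask) | (value & mask);
            match self
                .0
                .compare_exchange_weak(old, new, Ordering::Relaxed, Ordering::Relaxed)
            {
                Ok(prev) => return prev,
                Err(actual) => old = actual,
            }
        }
    }
}
```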

View File

@@ -36,7 +36,6 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -72,9 +71,7 @@ use utils::{
lsn::Lsn,
};
use super::{
AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ValuesReconstructState,
};
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
///
/// Header stored in the beginning of the file
@@ -199,7 +196,6 @@ impl DeltaKey {
pub struct DeltaLayer {
path: Utf8PathBuf,
pub desc: PersistentLayerDesc,
access_stats: LayerAccessStats,
inner: OnceCell<Arc<DeltaLayerInner>>,
}
@@ -298,7 +294,6 @@ impl DeltaLayer {
/// not loaded already.
///
async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
self.access_stats.record_access(ctx);
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner(ctx))
@@ -349,7 +344,6 @@ impl DeltaLayer {
summary.lsn_range,
metadata.len(),
),
access_stats: Default::default(),
inner: OnceCell::new(),
})
}
@@ -814,95 +808,6 @@ impl DeltaLayerInner {
})
}
pub(super) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let mut need_image = true;
// Scan the page versions backwards, starting from `lsn`.
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
&block_reader,
);
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
tree_reader
.visit(
&search_key.0,
VisitDirection::Backwards,
|key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
!blob_ref.will_init()
},
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
.build(),
)
.await?;
let ctx = &RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerValue)
.build();
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let cursor = block_reader.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
cursor
.read_blob_into_buf(pos, &mut buf, ctx)
.await
.with_context(|| {
format!("Failed to read blob from virtual file {}", self.file.path)
})?;
let val = Value::des(&buf).with_context(|| {
format!(
"Failed to deserialize file blob from virtual file {}",
self.file.path
)
})?;
match val {
Value::Image(img) => {
reconstruct_state.img = Some((entry_lsn, img));
need_image = false;
break;
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(ValueReconstructResult::Continue)
} else {
Ok(ValueReconstructResult::Complete)
}
}
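The removed function implemented the per-layer half of the old traversal protocol: walk the entries for a key backwards and stop at the first value that initializes the page. In schematic form (types simplified; `Value` and `will_init` as in the code above, the helper itself is not from the diff):

```
// Schematic of the early-stop rule, not the removed implementation itself.
fn scan_backwards(values_newest_first: impl Iterator<Item = Value>) -> (Vec<Value>, bool) {
    let mut collected = Vec::new();
    for v in values_newest_first {
        let will_init = match &v {
            Value::Image(_) => true,                  // a full page image
            Value::WalRecord(rec) => rec.will_init(), // a self-initializing record
        };
        collected.push(v);
        if will_init {
            // Redo chain is complete: ValueReconstructResult::Complete.
            return (collected, true);
        }
    }
    // Layer exhausted: the caller continues into an older layer (Continue).
    (collected, false)
}
```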
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
//

View File

@@ -32,9 +32,6 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -137,7 +134,6 @@ pub struct ImageLayer {
pub desc: PersistentLayerDesc,
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
pub lsn: Lsn,
access_stats: LayerAccessStats,
inner: OnceCell<ImageLayerInner>,
}
@@ -255,7 +251,6 @@ impl ImageLayer {
/// not loaded already.
///
async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
self.access_stats.record_access(ctx);
self.inner
.get_or_try_init(|| self.load_inner(ctx))
.await
@@ -306,7 +301,6 @@ impl ImageLayer {
metadata.len(),
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
access_stats: Default::default(),
inner: OnceCell::new(),
})
}
@@ -429,46 +423,6 @@ impl ImageLayerInner {
})
}
pub(super) async fn get_value_reconstruct_data(
&self,
key: Key,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader
.get(
&keybuf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
.build(),
)
.await?
{
let blob = block_reader
.block_cursor()
.read_blob(
offset,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerValue)
.build(),
)
.await
.with_context(|| format!("failed to read value from offset {}", offset))?;
let value = Bytes::from(blob);
reconstruct_state.img = Some((self.lsn, value));
Ok(ValueReconstructResult::Complete)
} else {
Ok(ValueReconstructResult::Missing)
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
pub(super) async fn get_values_reconstruct_data(
@@ -753,6 +707,10 @@ struct ImageLayerWriterInner {
}
impl ImageLayerWriterInner {
fn size(&self) -> u64 {
self.tree.borrow_writer().size() + self.blob_writer.size()
}
///
/// Start building a new image layer.
///
@@ -1044,6 +1002,10 @@ impl ImageLayerWriter {
.finish(timeline, ctx, Some(end_key))
.await
}
pub(crate) fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
}
}
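The new `size()` accessor exposes the writer's current on-disk footprint (b-tree index plus blob bytes). A hedged sketch of the kind of caller it enables; the threshold and rollover step are assumptions, not code from this change:

```
// Hypothetical compaction-side use: cap the size of an image layer.
const TARGET_IMAGE_LAYER_SIZE: u64 = 128 * 1024 * 1024;

fn should_roll_over(writer: &ImageLayerWriter) -> bool {
    // If the index plus blobs already exceed the target, finish this
    // layer and open a new writer at the current key.
    writer.size() >= TARGET_IMAGE_LAYER_SIZE
}
```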
impl Drop for ImageLayerWriter {

View File

@@ -10,11 +10,10 @@ use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value};
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError;
use crate::{l0_flush, page_cache, walrecord};
use anyhow::{anyhow, ensure, Result};
use anyhow::{anyhow, Result};
use camino::Utf8PathBuf;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
@@ -32,46 +31,15 @@ use std::fmt::Write;
use std::ops::Range;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::atomic::{AtomicU64, AtomicUsize};
use tokio::sync::RwLock;
use tokio::sync::{RwLock, RwLockWriteGuard};
use super::{
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValueReconstructState,
ValuesReconstructState,
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
#[derive(Ord, PartialOrd, Eq, PartialEq)]
struct IndexPrefix {
field1: u8,
field2: u32,
field3: u32,
field4: u32,
field5: u8,
}
fn materialize_key(prefix: &IndexPrefix, blkno: u32) -> Key {
Key {
field1: prefix.field1,
field2: prefix.field2,
field3: prefix.field3,
field4: prefix.field4,
field5: prefix.field5,
field6: blkno,
}
}
fn key_to_prefix(key: &Key) -> IndexPrefix {
IndexPrefix {
field1: key.field1,
field2: key.field2,
field3: key.field3,
field4: key.field4,
field5: key.field5,
}
}
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
@@ -86,9 +54,6 @@ pub struct InMemoryLayer {
/// Writes are only allowed when this is `None`.
pub(crate) end_lsn: OnceLock<Lsn>,
/// Used for traversal path. Cached representation of the in-memory layer before it is frozen.
local_path_str: Arc<str>,
/// Used for traversal path. Cached representation of the in-memory layer after it is frozen.
frozen_local_path_str: OnceLock<Arc<str>>,
@@ -113,7 +78,7 @@ pub struct InMemoryLayerInner {
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The value is an offset into the
/// ephemeral file where the page version is stored.
index: BTreeMap<IndexPrefix, BTreeMap<u32, VecMap<Lsn, u64>>>,
index: BTreeMap<Key, VecMap<Lsn, u64>>,
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
@@ -279,12 +244,6 @@ impl InMemoryLayer {
self.start_lsn..self.end_lsn_or_max()
}
pub(crate) fn local_path_str(&self) -> &Arc<str> {
self.frozen_local_path_str
.get()
.unwrap_or(&self.local_path_str)
}
/// debugging function to print out the contents of the layer
///
/// this is likely completely unused
@@ -304,96 +263,36 @@ impl InMemoryLayer {
let cursor = inner.file.block_cursor();
let mut buf = Vec::new();
for (key_prefix, inner) in inner.index.iter() {
for (blkno, vec_map) in inner {
let key = materialize_key(key_prefix, *blkno);
for (lsn, pos) in vec_map.as_slice() {
let mut desc = String::new();
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
Ok(Value::WalRecord(rec)) => {
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)?;
}
Err(err) => {
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
}
for (key, vec_map) in inner.index.iter() {
for (lsn, pos) in vec_map.as_slice() {
let mut desc = String::new();
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
Ok(Value::WalRecord(rec)) => {
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)?;
}
Err(err) => {
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
}
println!(" key {} at {}: {}", key, lsn, desc);
}
println!(" key {} at {}: {}", key, lsn, desc);
}
}
Ok(())
}
/// Look up given value in the layer.
pub(crate) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.start_lsn);
let mut need_image = true;
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
let inner = self.inner.read().await;
let reader = inner.file.block_cursor();
// Scan the page versions backwards, starting from `lsn`.
if let Some(inner) = inner.index.get(&key_to_prefix(&key)) {
if let Some(vec_map) = inner.get(&key.field6) {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
let buf = reader.read_blob(*pos, &ctx).await?;
let value = Value::des(&buf)?;
match value {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
return Ok(ValueReconstructResult::Complete);
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((*entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
}
}
// release lock on 'inner'
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(ValueReconstructResult::Continue)
} else {
Ok(ValueReconstructResult::Complete)
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
//
@@ -413,54 +312,34 @@ impl InMemoryLayer {
let reader = inner.file.block_cursor();
for range in keyspace.ranges.iter() {
let range_incl = range.start..=Key::from_i128(Key::to_i128(&range.end) - 1);
let prefix_start = key_to_prefix(&range.start);
let prefix_end = key_to_prefix(&range.end);
for (prefix, relation_idx) in inner.index.range(prefix_start..=prefix_end) {
    let blkno_start = if prefix == &key_to_prefix(&range_incl.start()) {
        range_incl.start().field6
    } else {
        0
    };
    let blkno_end = if prefix == &key_to_prefix(&range_incl.end()) {
        range_incl.end().field6
    } else {
        0xffffffff
    };
    for (blkno, vec_map) in relation_idx.range(blkno_start..=blkno_end) {
        let key = materialize_key(prefix, *blkno);
        let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
            Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
            None => self.start_lsn..end_lsn,
        };
        let slice = vec_map.slice_range(lsn_range);
        for (entry_lsn, pos) in slice.iter().rev() {
            // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
            let buf = reader.read_blob(*pos, &ctx).await;
            if let Err(e) = buf {
                reconstruct_state
                    .on_key_error(key, PageReconstructError::from(anyhow!(e)));
                break;
            }
            let value = Value::des(&buf.unwrap());
            if let Err(e) = value {
                reconstruct_state
                    .on_key_error(key, PageReconstructError::from(anyhow!(e)));
                break;
            }
            let key_situation =
                reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
            if key_situation == ValueReconstructSituation::Complete {
                break;
            }
        }
    }
}
for (key, vec_map) in inner.index.range(range.start..range.end) {
    let lsn_range = match reconstruct_state.get_cached_lsn(key) {
        Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
        None => self.start_lsn..end_lsn,
    };
    let slice = vec_map.slice_range(lsn_range);
    for (entry_lsn, pos) in slice.iter().rev() {
        // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
        let buf = reader.read_blob(*pos, &ctx).await;
        if let Err(e) = buf {
            reconstruct_state
                .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
            break;
        }
        let value = Value::des(&buf.unwrap());
        if let Err(e) = value {
            reconstruct_state
                .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
            break;
        }
        let key_situation =
            reconstruct_state.update_key(key, *entry_lsn, value.unwrap());
        if key_situation == ValueReconstructSituation::Complete {
            break;
        }
}
}
@@ -472,74 +351,6 @@ impl InMemoryLayer {
}
}
pub struct SerializedBatch {
/// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
pub(crate) raw: Vec<u8>,
/// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
pub(crate) offsets: Vec<(Key, Lsn, u64)>,
/// The highest LSN of any value in the batch
pub(crate) max_lsn: Lsn,
}
impl SerializedBatch {
/// Write a blob length in the internal format of the EphemeralFile
pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
use std::io::Write;
if len < 0x80 {
// short one-byte length header
let len_buf = [len as u8];
cursor
.write_all(&len_buf)
.expect("Writing to Vec is infallible");
} else {
let mut len_buf = u32::to_be_bytes(len as u32);
len_buf[0] |= 0x80;
cursor
.write_all(&len_buf)
.expect("Writing to Vec is infallible");
}
}
pub fn from_values(batch: Vec<(Key, Lsn, Value)>) -> Self {
use std::io::Write;
let mut offsets: Vec<(Key, Lsn, u64)> = Vec::new();
let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(batch.len() * 8192));
let mut max_lsn: Lsn = Lsn(0);
let mut value_buf = smallvec::SmallVec::<[u8; 256]>::new();
for (key, lsn, val) in batch {
let relative_off = cursor.position();
value_buf.clear();
val.ser_into(&mut value_buf)
.expect("Value serialization is infallible");
Self::write_blob_length(value_buf.len(), &mut cursor);
cursor
.write_all(&value_buf)
.expect("Writing to Vec is infallible");
// We can't write straight into the buffer, because the InMemoryLayer file format requires
// the size to come before the value. However... we could probably calculate the size before
// actually serializing the value
//val.ser_into(&mut cursor)?;
offsets.push((key, lsn, relative_off));
max_lsn = std::cmp::max(max_lsn, lsn);
}
Self {
raw: cursor.into_inner(),
offsets,
max_lsn,
}
}
}
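For reference, the length header written by `write_blob_length` above is one byte for blobs under 0x80 bytes, and otherwise four big-endian bytes with the top bit of the first byte set. A sketch of the matching decoder (the reader side is not part of this diff; shown only to make the format concrete):

```
/// Returns (blob_len, header_len). Inverse of write_blob_length.
fn read_blob_length(buf: &[u8]) -> (usize, usize) {
    if buf[0] & 0x80 == 0 {
        // short one-byte length header
        (buf[0] as usize, 1)
    } else {
        // 4-byte big-endian length, high bit of the first byte masked off
        let raw = [buf[0] & 0x7f, buf[1], buf[2], buf[3]];
        (u32::from_be_bytes(raw) as usize, 4)
    }
}
```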
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
}
@@ -583,11 +394,6 @@ impl InMemoryLayer {
Ok(InMemoryLayer {
file_id: key,
local_path_str: {
let mut buf = String::new();
inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
buf.into()
},
frozen_local_path_str: OnceLock::new(),
conf,
timeline_id,
@@ -603,20 +409,37 @@ impl InMemoryLayer {
})
}
// Write path.
pub async fn put_batch(
    &self,
    serialized_batch: &SerializedBatch,
    ctx: &RequestContext,
) -> Result<()> {
    let mut inner = self.inner.write().await;
    //self.assert_writable();
    let base_off = {
        inner
            .file
            .write_raw(
                &serialized_batch.raw,
// Write operations
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub async fn put_value(
    &self,
    key: Key,
    lsn: Lsn,
    buf: &[u8],
    ctx: &RequestContext,
) -> Result<()> {
    let mut inner = self.inner.write().await;
    self.assert_writable();
    self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
}
async fn put_value_locked(
    &self,
    locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
    key: Key,
    lsn: Lsn,
    buf: &[u8],
    ctx: &RequestContext,
) -> Result<()> {
    trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
    let off = {
        locked_inner
            .file
            .write_blob(
                buf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build(),
@@ -624,21 +447,15 @@ impl InMemoryLayer {
.await?
};
for (key, lsn, relative_off) in &serialized_batch.offsets {
let prefix = key_to_prefix(&key);
let relation_idx = match inner.index.get_mut(&prefix) {
Some(i) => i,
None => inner.index.entry(prefix).or_default(),
};
let off = base_off + relative_off;
let vec_map = relation_idx.entry(key.field6).or_default();
vec_map.append_fast(*lsn, off);
let vec_map = locked_inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Key {} at {} already exists", key, lsn);
}
let size = inner.file.len();
inner.resource_units.maybe_publish_size(size);
let size = locked_inner.file.len();
locked_inner.resource_units.maybe_publish_size(size);
Ok(())
}
@@ -678,15 +495,15 @@ impl InMemoryLayer {
})
.expect("frozen_local_path_str set only once");
// #[cfg(debug_assertions)]
// {
// let inner = self.inner.write().await;
// for vec_map in inner.index.values() {
// for (lsn, _pos) in vec_map.as_slice() {
// assert!(*lsn < end_lsn);
// }
// }
// }
#[cfg(debug_assertions)]
{
let inner = self.inner.write().await;
for vec_map in inner.index.values() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
}
}
}
/// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
@@ -720,12 +537,11 @@ impl InMemoryLayer {
let end_lsn = *self.end_lsn.get().unwrap();
let key_count = if let Some(key_range) = key_range {
panic!("Update for IndexPrefix");
// inner
// .index
// .iter()
// .filter(|(k, _)| key_range.contains(k))
// .count()
inner
.index
.iter()
.filter(|(k, _)| key_range.contains(k))
.count()
} else {
inner.index.len()
};
@@ -753,20 +569,16 @@ impl InMemoryLayer {
let cursor = inner.file.block_cursor();
for (key_prefix, inner) in inner.index.iter() {
for (blkno, vec_map) in inner {
let key = materialize_key(key_prefix, *blkno);
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(key, *lsn, buf, will_init, &ctx)
.await;
res?;
}
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(*key, *lsn, buf, will_init, &ctx)
.await;
res?;
}
}
}
@@ -790,25 +602,22 @@ impl InMemoryLayer {
let mut buf = Vec::new();
for (key_prefix, inner) in inner.index.iter() {
for (blkno, vec_map) in inner {
// Write all page versions
let key = materialize_key(key_prefix, *blkno);
for (lsn, pos) in vec_map.as_slice() {
// TODO: once we have blob lengths in the in-memory index, we can
// 1. get rid of the blob_io / BlockReaderRef::Slice business and
// 2. load the file contents into a Bytes and
// 3. the use `Bytes::slice` to get the `buf` that is our blob
// 4. pass that `buf` into `put_value_bytes`
// => https://github.com/neondatabase/neon/issues/8183
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(key, *lsn, buf, will_init, ctx)
.await;
res?;
}
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
// TODO: once we have blob lengths in the in-memory index, we can
// 1. get rid of the blob_io / BlockReaderRef::Slice business and
// 2. load the file contents into a Bytes and
// 3. the use `Bytes::slice` to get the `buf` that is our blob
// 4. pass that `buf` into `put_value_bytes`
// => https://github.com/neondatabase/neon/issues/8183
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(*key, *lsn, buf, will_init, ctx)
.await;
res?;
}
}
}

View File

@@ -24,8 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self};
use super::{
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
ValuesReconstructState,
LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
};
use utils::generation::Generation;
@@ -301,42 +300,6 @@ impl Layer {
self.0.delete_on_drop();
}
/// Return data needed to reconstruct given page at LSN.
///
/// It is up to the caller to collect more data from the previous layer and
/// perform WAL redo, if necessary.
///
/// # Cancellation-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
use anyhow::ensure;
let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
self.0.access_stats.record_access(ctx);
if self.layer_desc().is_delta {
ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
ensure!(self.layer_desc().key_range.contains(&key));
} else {
ensure!(self.layer_desc().key_range.contains(&key));
ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
}
layer
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
.instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
.await
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
}
pub(crate) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
@@ -353,7 +316,7 @@ impl Layer {
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
self.0.access_stats.record_access(ctx);
self.record_access(ctx);
layer
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -433,18 +396,18 @@ impl Layer {
self.0.info(reset)
}
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
&self.0.access_stats
pub(crate) fn latest_activity(&self) -> SystemTime {
self.0.access_stats.latest_activity()
}
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
self.0.access_stats.visibility()
}
pub(crate) fn local_path(&self) -> &Utf8Path {
&self.0.path
}
pub(crate) fn debug_str(&self) -> &Arc<str> {
&self.0.debug_str
}
pub(crate) fn metadata(&self) -> LayerFileMetadata {
self.0.metadata()
}
@@ -488,13 +451,31 @@ impl Layer {
}
}
fn record_access(&self, ctx: &RequestContext) {
if self.0.access_stats.record_access(ctx) {
// Visibility was modified to Visible
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge
.add(self.0.desc.file_size)
}
}
}
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
let old_visibility = self.access_stats().set_visibility(visibility.clone());
let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
use LayerVisibilityHint::*;
match (old_visibility, visibility) {
(Visible, Covered) => {
// Subtract this layer's contribution to the visible size metric
if let Some(tl) = self.0.timeline.upgrade() {
debug_assert!(
tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
);
tl.metrics
.visible_physical_size_gauge
.sub(self.0.desc.file_size)
@@ -519,7 +500,7 @@ impl Layer {
///
/// However when we want something evicted, we cannot evict it right away as there might be current
/// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
/// read with [`Layer::get_value_reconstruct_data`].
/// read with [`Layer::get_values_reconstruct_data`].
///
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
#[derive(Debug)]
@@ -600,9 +581,6 @@ struct LayerInner {
/// Full path to the file; unclear if this should exist anymore.
path: Utf8PathBuf,
/// String representation of the layer, used for traversal id.
debug_str: Arc<str>,
desc: PersistentLayerDesc,
/// Timeline access is needed for remote timeline client and metrics.
@@ -715,6 +693,9 @@ impl Drop for LayerInner {
}
if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
debug_assert!(
timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
);
timeline
.metrics
.visible_physical_size_gauge
@@ -836,9 +817,6 @@ impl LayerInner {
LayerInner {
conf,
debug_str: {
format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
},
path: local_path,
desc,
timeline: Arc::downgrade(timeline),
@@ -1759,28 +1737,6 @@ impl DownloadedLayer {
.map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
}
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
owner: &Arc<LayerInner>,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
use LayerKind::*;
match self.get(owner, ctx).await? {
Delta(d) => {
d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
.await
}
Image(i) => {
i.get_value_reconstruct_data(key, reconstruct_data, ctx)
.await
}
}
}
async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
@@ -1879,7 +1835,7 @@ impl ResidentLayer {
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
// while it's being held.
owner.access_stats.record_access(ctx);
self.owner.record_access(ctx);
delta_layer::DeltaLayerInner::load_keys(d, ctx)
.await

View File

@@ -50,13 +50,26 @@ async fn smoke_test() {
// all layers created at pageserver are like `layer`, initialized with strong
// Arc<DownloadedLayer>.
let controlfile_keyspace = KeySpace {
ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
};
let img_before = {
let mut data = ValueReconstructState::default();
let mut data = ValuesReconstructState::default();
layer
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.get_values_reconstruct_data(
controlfile_keyspace.clone(),
Lsn(0x10)..Lsn(0x11),
&mut data,
&ctx,
)
.await
.unwrap();
data.img
data.keys
.remove(&CONTROLFILE_KEY)
.expect("must be present")
.expect("should not error")
.img
.take()
.expect("tenant harness writes the control file")
};
@@ -74,13 +87,24 @@ async fn smoke_test() {
// on accesses when the layer is evicted, it will automatically be downloaded.
let img_after = {
let mut data = ValueReconstructState::default();
let mut data = ValuesReconstructState::default();
layer
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.get_values_reconstruct_data(
controlfile_keyspace.clone(),
Lsn(0x10)..Lsn(0x11),
&mut data,
&ctx,
)
.instrument(download_span.clone())
.await
.unwrap();
data.img.take().unwrap()
data.keys
.remove(&CONTROLFILE_KEY)
.expect("must be present")
.expect("should not error")
.img
.take()
.expect("tenant harness writes the control file")
};
assert_eq!(img_before, img_after);
@@ -830,7 +854,7 @@ async fn eviction_cancellation_on_drop() {
fn layer_size() {
assert_eq!(size_of::<LayerAccessStats>(), 8);
assert_eq!(size_of::<PersistentLayerDesc>(), 104);
assert_eq!(size_of::<LayerInner>(), 312);
assert_eq!(size_of::<LayerInner>(), 296);
// it also has the utf8 path
}
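The 16-byte drop in `size_of::<LayerInner>()` (312 to 296) matches the removed `debug_str: Arc<str>` field: on a 64-bit target `Arc<str>` is a fat pointer (data pointer plus length).

```
// Why the struct shrank by exactly 16 bytes on a 64-bit target:
assert_eq!(std::mem::size_of::<std::sync::Arc<str>>(), 16);
```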

View File

@@ -407,9 +407,16 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
if matches!(e, crate::tenant::GcError::TimelineCancelled) {
// Timeline was cancelled during gc. We might either be in an event
// that affects the entire tenant (tenant deletion, pageserver shutdown),
// or in one that affects the timeline only (timeline deletion).
// Therefore, don't exit the loop.
info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
} else {
error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
}
wait_duration
}
}

View File

@@ -3,7 +3,6 @@ pub(crate) mod compaction;
pub mod delete;
pub(crate) mod detach_ancestor;
mod eviction_task;
pub(crate) mod handle;
mod init;
pub mod layer_manager;
pub(crate) mod logical_size;
@@ -18,12 +17,11 @@ use camino::Utf8Path;
use chrono::{DateTime, Utc};
use enumset::EnumSet;
use fail::fail_point;
use handle::ShardTimelineId;
use once_cell::sync::Lazy;
use pageserver_api::{
key::{
AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
NON_INHERITED_SPARSE_RANGE,
},
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
models::{
@@ -47,6 +45,7 @@ use utils::{
bin_ser::BeSer,
fs_ext, pausable_failpoint,
sync::gate::{Gate, GateGuard},
vec_map::VecMap,
};
use std::pin::pin;
@@ -58,10 +57,7 @@ use std::{
collections::{BTreeMap, HashMap, HashSet},
sync::atomic::AtomicU64,
};
use std::{
cmp::{max, min},
ops::ControlFlow,
};
use std::{cmp::min, ops::ControlFlow};
use std::{
collections::btree_map::Entry,
ops::{Deref, Range},
@@ -86,8 +82,8 @@ use crate::{
disk_usage_eviction_task::finite_f32,
tenant::storage_layer::{
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult,
ValueReconstructState, ValuesReconstructState,
LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState,
ValuesReconstructState,
},
};
use crate::{
@@ -139,7 +135,7 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::{config::TenantConf, storage_layer::inmemory_layer, upload_queue::NotInitialized};
use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{
@@ -429,8 +425,6 @@ pub struct Timeline {
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
pub(crate) l0_flush_global_state: L0FlushGlobalState,
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
}
pub struct WalReceiverInfo {
@@ -542,7 +536,6 @@ pub struct MissingKeyError {
cont_lsn: Lsn,
request_lsn: Lsn,
ancestor_lsn: Option<Lsn>,
traversal_path: Vec<TraversalPathItem>,
backtrace: Option<std::backtrace::Backtrace>,
}
@@ -563,18 +556,6 @@ impl std::fmt::Display for MissingKeyError {
write!(f, ", ancestor {}", ancestor_lsn)?;
}
if !self.traversal_path.is_empty() {
writeln!(f)?;
}
for (r, c, l) in &self.traversal_path {
writeln!(
f,
"layer traversal: result {:?}, cont_lsn {}, layer: {}",
r, c, l,
)?;
}
if let Some(ref backtrace) = self.backtrace {
write!(f, "\n{}", backtrace)?;
}
@@ -703,6 +684,7 @@ pub(crate) enum CompactFlags {
ForceRepartition,
ForceImageLayerCreation,
EnhancedGcBottomMostCompaction,
DryRun,
}
impl std::fmt::Debug for Timeline {
@@ -916,119 +898,44 @@ impl Timeline {
self.timeline_get_throttle.throttle(ctx, 1).await;
match self.conf.get_impl {
    GetImpl::Legacy => {
        let reconstruct_state = ValueReconstructState {
            records: Vec::new(),
            img: None,
        };
        self.get_impl(key, lsn, reconstruct_state, ctx).await
    }
    GetImpl::Vectored => {
        let keyspace = KeySpace {
            ranges: vec![key..key.next()],
        };
        // Initialise the reconstruct state for the key with the cache
        // entry returned above.
        let mut reconstruct_state = ValuesReconstructState::new();
        let vectored_res = self
            .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
            .await;
        if self.conf.validate_vectored_get {
            self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
                .await;
        }
        let key_value = vectored_res?.pop_first();
        match key_value {
            Some((got_key, value)) => {
                if got_key != key {
                    error!(
                        "Expected {}, but singular vectored get returned {}",
                        key, got_key
                    );
                    Err(PageReconstructError::Other(anyhow!(
                        "Singular vectored get returned wrong key"
                    )))
                } else {
                    value
                }
            }
            None => Err(PageReconstructError::MissingKey(MissingKeyError {
                key,
                shard: self.shard_identity.get_shard_number(&key),
                cont_lsn: Lsn(0),
                request_lsn: lsn,
                ancestor_lsn: None,
                traversal_path: Vec::new(),
                backtrace: None,
            })),
        }
    }
}
let keyspace = KeySpace {
    ranges: vec![key..key.next()],
};
// Initialise the reconstruct state for the key with the cache
// entry returned above.
let mut reconstruct_state = ValuesReconstructState::new();
let vectored_res = self
    .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
    .await;
let key_value = vectored_res?.pop_first();
match key_value {
    Some((got_key, value)) => {
        if got_key != key {
            error!(
                "Expected {}, but singular vectored get returned {}",
                key, got_key
            );
            Err(PageReconstructError::Other(anyhow!(
                "Singular vectored get returned wrong key"
            )))
        } else {
            value
        }
    }
    None => Err(PageReconstructError::MissingKey(MissingKeyError {
        key,
        shard: self.shard_identity.get_shard_number(&key),
        cont_lsn: Lsn(0),
        request_lsn: lsn,
        ancestor_lsn: None,
        backtrace: None,
    })),
}
}
/// Not subject to [`Self::timeline_get_throttle`].
async fn get_impl(
&self,
key: Key,
lsn: Lsn,
mut reconstruct_state: ValueReconstructState,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
// XXX: structured stats collection for layer eviction here.
trace!(
"get page request for {}@{} from task kind {:?}",
key,
lsn,
ctx.task_kind()
);
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
.for_get_kind(GetKind::Singular)
.start_timer();
let path = self
.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
.await?;
timer.stop_and_record();
let start = Instant::now();
let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
let elapsed = start.elapsed();
crate::metrics::RECONSTRUCT_TIME
.for_get_kind(GetKind::Singular)
.observe(elapsed.as_secs_f64());
if cfg!(feature = "testing")
&& res.is_err()
&& !matches!(res, Err(PageReconstructError::Cancelled))
{
// it can only be walredo issue
use std::fmt::Write;
let mut msg = String::new();
path.into_iter().for_each(|(res, cont_lsn, layer)| {
writeln!(
msg,
"- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
layer,
)
.expect("string grows")
});
// this is to rule out or provide evidence that we could in some cases read a duplicate
// walrecord
tracing::info!("walredo failed, path:\n{msg}");
}
res
}
pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;
@@ -1078,28 +985,14 @@ impl Timeline {
.throttle(ctx, key_count as usize)
.await;
let res = match self.conf.get_vectored_impl {
GetVectoredImpl::Sequential => {
self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
}
GetVectoredImpl::Vectored => {
let vectored_res = self
.get_vectored_impl(
keyspace.clone(),
lsn,
&mut ValuesReconstructState::new(),
ctx,
)
.await;
if self.conf.validate_vectored_get {
self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
.await;
}
vectored_res
}
};
let res = self
.get_vectored_impl(
keyspace.clone(),
lsn,
&mut ValuesReconstructState::new(),
ctx,
)
.await;
if let Some((metric, start)) = start {
let elapsed = start.elapsed();
@@ -1188,65 +1081,6 @@ impl Timeline {
vectored_res
}
/// Not subject to [`Self::timeline_get_throttle`].
pub(super) async fn get_vectored_sequential_impl(
&self,
keyspace: KeySpace,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
let mut values = BTreeMap::new();
for range in keyspace.ranges {
let mut key = range.start;
while key != range.end {
let block = self
.get_impl(key, lsn, ValueReconstructState::default(), ctx)
.await;
use PageReconstructError::*;
match block {
Err(Cancelled) => return Err(GetVectoredError::Cancelled),
Err(MissingKey(_))
if NON_INHERITED_RANGE.contains(&key)
|| NON_INHERITED_SPARSE_RANGE.contains(&key) =>
{
// Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range.
// When we add more types of keys into the page server, we should revisit this part of code and throw errors
// accordingly.
key = key.next();
}
Err(MissingKey(err)) => {
return Err(GetVectoredError::MissingKey(err));
}
Err(Other(err))
if err
.to_string()
.contains("downloading evicted layer file failed") =>
{
return Err(GetVectoredError::Other(err))
}
Err(Other(err))
if err
.chain()
.any(|cause| cause.to_string().contains("layer loading failed")) =>
{
// The intent here is to achieve error parity with the vectored read path.
// When vectored read fails to load a layer it fails the whole read, hence
// we mimic this behaviour here to keep the validation happy.
return Err(GetVectoredError::Other(err));
}
_ => {
values.insert(key, block);
key = key.next();
}
}
}
}
Ok(values)
}
pub(super) async fn get_vectored_impl(
&self,
keyspace: KeySpace,
@@ -1317,113 +1151,6 @@ impl Timeline {
Ok(results)
}
/// Not subject to [`Self::timeline_get_throttle`].
pub(super) async fn validate_get_vectored_impl(
&self,
vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,
keyspace: KeySpace,
lsn: Lsn,
ctx: &RequestContext,
) {
if keyspace.overlaps(&Key::metadata_key_range()) {
// skip validation for metadata key range
return;
}
let sequential_res = self
.get_vectored_sequential_impl(keyspace.clone(), lsn, ctx)
.await;
fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool {
use GetVectoredError::*;
match (lhs, rhs) {
(Oversized(l), Oversized(r)) => l == r,
(InvalidLsn(l), InvalidLsn(r)) => l == r,
(MissingKey(l), MissingKey(r)) => l.key == r.key,
(GetReadyAncestorError(_), GetReadyAncestorError(_)) => true,
(Other(_), Other(_)) => true,
_ => false,
}
}
match (&sequential_res, vectored_res) {
(Err(GetVectoredError::Cancelled), _) => {},
(_, Err(GetVectoredError::Cancelled)) => {},
(Err(seq_err), Ok(_)) => {
panic!(concat!("Sequential get failed with {}, but vectored get did not",
" - keyspace={:?} lsn={}"),
seq_err, keyspace, lsn) },
(Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => {
// Sequential get runs after vectored get, so it is possible for the latter
// to time out while waiting for its ancestor's Lsn to become ready and for the
// former to succeed (it essentially has a doubled wait time).
},
(Ok(_), Err(vec_err)) => {
panic!(concat!("Vectored get failed with {}, but sequential get did not",
" - keyspace={:?} lsn={}"),
vec_err, keyspace, lsn) },
(Err(seq_err), Err(vec_err)) => {
assert!(errors_match(seq_err, vec_err),
"Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")},
(Ok(seq_values), Ok(vec_values)) => {
seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| {
assert_eq!(seq_key, vec_key);
match (seq_res, vec_res) {
(Ok(seq_blob), Ok(vec_blob)) => {
Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob);
},
(Err(err), Ok(_)) => {
panic!(
concat!("Sequential get failed with {} for key {}, but vectored get did not",
" - keyspace={:?} lsn={}"),
err, seq_key, keyspace, lsn) },
(Ok(_), Err(err)) => {
panic!(
concat!("Vectored get failed with {} for key {}, but sequential get did not",
" - keyspace={:?} lsn={}"),
err, seq_key, keyspace, lsn) },
(Err(_), Err(_)) => {}
}
})
}
}
}
fn validate_key_equivalence(
key: &Key,
keyspace: &KeySpace,
lsn: Lsn,
seq: &Bytes,
vec: &Bytes,
) {
if *key == AUX_FILES_KEY {
// The value reconstruct of AUX_FILES_KEY from records is not deterministic
// since it uses a hash map under the hood. Hence, deserialise both results
// before comparing.
let seq_aux_dir_res = AuxFilesDirectory::des(seq);
let vec_aux_dir_res = AuxFilesDirectory::des(vec);
match (&seq_aux_dir_res, &vec_aux_dir_res) {
(Ok(seq_aux_dir), Ok(vec_aux_dir)) => {
assert_eq!(
seq_aux_dir, vec_aux_dir,
"Mismatch for key {} - keyspace={:?} lsn={}",
key, keyspace, lsn
);
}
(Err(_), Err(_)) => {}
_ => {
panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}");
}
}
} else {
// All other keys should reconstruct deterministically, so we simply compare the blobs.
assert_eq!(
seq, vec,
"Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}"
);
}
}
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
pub(crate) fn get_last_record_lsn(&self) -> Lsn {
self.last_record_lsn.load().last
@@ -1925,9 +1652,6 @@ impl Timeline {
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// Prevent new page service requests from starting.
self.handles.shutdown();
// Transition the remote_client into a state where it's only useful for timeline deletion.
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
self.remote_client.stop();
@@ -2453,8 +2177,6 @@ impl Timeline {
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
l0_flush_global_state: resources.l0_flush_global_state,
handles: Default::default(),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -3188,14 +2910,22 @@ impl Timeline {
let guard = self.layers.read().await;
let resident = guard.likely_resident_layers().map(|layer| {
let last_activity_ts = layer.access_stats().latest_activity();
HeatMapLayer::new(
layer.layer_desc().layer_name(),
layer.metadata(),
last_activity_ts,
)
let resident = guard.likely_resident_layers().filter_map(|layer| {
match layer.visibility() {
LayerVisibilityHint::Visible => {
// Layer is visible to one or more read LSNs: eligible for inclusion in the heatmap
let last_activity_ts = layer.latest_activity();
Some(HeatMapLayer::new(
layer.layer_desc().layer_name(),
layer.metadata(),
last_activity_ts,
))
}
LayerVisibilityHint::Covered => {
// Layer is resident but unlikely to be read: not eligible for inclusion in the heatmap.
None
}
}
});
let layers = resident.collect();
@@ -3213,228 +2943,7 @@ impl Timeline {
}
}
type TraversalId = Arc<str>;
trait TraversalLayerExt {
fn traversal_id(&self) -> TraversalId;
}
impl TraversalLayerExt for Layer {
fn traversal_id(&self) -> TraversalId {
Arc::clone(self.debug_str())
}
}
impl TraversalLayerExt for Arc<InMemoryLayer> {
fn traversal_id(&self) -> TraversalId {
Arc::clone(self.local_path_str())
}
}
impl Timeline {
///
/// Get a handle to a Layer for reading.
///
/// The returned Layer might be from an ancestor timeline, if the
/// segment hasn't been updated on this timeline yet.
///
/// This function takes the current timeline's locked LayerMap as an argument,
/// so callers can avoid potential race conditions.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
async fn get_reconstruct_data(
&self,
key: Key,
request_lsn: Lsn,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> Result<Vec<TraversalPathItem>, PageReconstructError> {
// Start from the current timeline.
let mut timeline_owned;
let mut timeline = self;
let mut read_count = scopeguard::guard(0, |cnt| {
crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
});
// For debugging purposes, collect the path of layers that we traversed
// through. It's included in the error message if we fail to find the key.
let mut traversal_path = Vec::<TraversalPathItem>::new();
let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
*cached_lsn
} else {
Lsn(0)
};
// 'prev_lsn' tracks the last LSN that we were at in our search. It's used
// to check that each iteration makes some progress, to break infinite
// looping if something goes wrong.
let mut prev_lsn = None;
let mut result = ValueReconstructResult::Continue;
let mut cont_lsn = Lsn(request_lsn.0 + 1);
'outer: loop {
if self.cancel.is_cancelled() {
return Err(PageReconstructError::Cancelled);
}
// The function should have updated 'state'
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
match result {
ValueReconstructResult::Complete => return Ok(traversal_path),
ValueReconstructResult::Continue => {
// If we reached an earlier cached page image, we're done.
if cont_lsn == cached_lsn + 1 {
return Ok(traversal_path);
}
if let Some(prev) = prev_lsn {
if prev <= cont_lsn {
// Didn't make any progress in last iteration. Error out to avoid
// getting stuck in the loop.
return Err(PageReconstructError::MissingKey(MissingKeyError {
key,
shard: self.shard_identity.get_shard_number(&key),
cont_lsn: Lsn(cont_lsn.0 - 1),
request_lsn,
ancestor_lsn: Some(timeline.ancestor_lsn),
traversal_path,
backtrace: None,
}));
}
}
prev_lsn = Some(cont_lsn);
}
ValueReconstructResult::Missing => {
return Err(PageReconstructError::MissingKey(MissingKeyError {
key,
shard: self.shard_identity.get_shard_number(&key),
cont_lsn,
request_lsn,
ancestor_lsn: None,
traversal_path,
backtrace: if cfg!(test) {
Some(std::backtrace::Backtrace::force_capture())
} else {
None
},
}));
}
}
// Recurse into ancestor if needed
if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
cont_lsn
);
timeline_owned = timeline
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
.await?;
timeline = &*timeline_owned;
prev_lsn = None;
continue 'outer;
}
}
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
// Check the open and frozen in-memory layers first, in order from newest
// to oldest.
if let Some(open_layer) = &layers.open_layer {
let start_lsn = open_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display());
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
let open_layer = open_layer.clone();
drop(guard);
result = match open_layer
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
)
.await
{
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
*read_count += 1;
traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
continue 'outer;
}
}
for frozen_layer in layers.frozen_layers.iter().rev() {
let start_lsn = frozen_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display());
let lsn_floor = max(cached_lsn + 1, start_lsn);
let frozen_layer = frozen_layer.clone();
drop(guard);
result = match frozen_layer
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
)
.await
{
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
*read_count += 1;
traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
continue 'outer;
}
}
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
let layer = guard.get_from_desc(&layer);
drop(guard);
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, lsn_floor);
result = match layer
.get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx)
.await
{
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
*read_count += 1;
traversal_path.push((result, cont_lsn, layer.traversal_id()));
continue 'outer;
} else if timeline.ancestor_timeline.is_some() {
// Nothing on this timeline. Traverse to parent
result = ValueReconstructResult::Continue;
cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
continue 'outer;
} else {
// Nothing found
result = ValueReconstructResult::Missing;
continue 'outer;
}
}
}
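As a rough aid to the traversal above: each layer visit lowers `cont_lsn` toward an older image, and the search either completes at a base image, continues descending (possibly into the ancestor timeline), or reports the key missing. A hedged, self-contained sketch of that control flow (toy integer LSNs and stub types, not the pageserver's actual API):

```rust
#[derive(Debug, PartialEq)]
enum Reconstruct {
    Complete,
    Continue,
    Missing,
}

fn visit_layer(cont_lsn: &mut u64, lsn_floor: u64) -> Reconstruct {
    *cont_lsn = lsn_floor; // resume the search below this layer
    if *cont_lsn == 0 {
        Reconstruct::Complete // reached a base image
    } else {
        Reconstruct::Continue // keep descending, possibly into the ancestor
    }
}

fn traverse(mut cont_lsn: u64, layer_floors: &[u64]) -> Reconstruct {
    for &floor in layer_floors {
        match visit_layer(&mut cont_lsn, floor) {
            Reconstruct::Complete => return Reconstruct::Complete,
            _ => continue,
        }
    }
    Reconstruct::Missing // ran out of layers without completing the value
}

fn main() {
    assert_eq!(traverse(100, &[80, 40, 0]), Reconstruct::Complete);
    assert_eq!(traverse(100, &[80, 40]), Reconstruct::Missing);
}
```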
#[allow(clippy::doc_lazy_continuation)]
/// Get the data needed to reconstruct all keys in the provided keyspace
///
@@ -3528,7 +3037,6 @@ impl Timeline {
cont_lsn,
request_lsn,
ancestor_lsn: Some(timeline.ancestor_lsn),
traversal_path: vec![],
backtrace: None,
}));
}
@@ -3728,17 +3236,6 @@ impl Timeline {
&self.shard_identity
}
#[inline(always)]
pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
ShardTimelineId {
shard_index: ShardIndex {
shard_number: self.shard_identity.number,
shard_count: self.shard_identity.count,
},
timeline_id: self.timeline_id,
}
}
///
/// Get a handle to the latest layer for appending.
///
@@ -4121,17 +3618,11 @@ impl Timeline {
/// Return true if the value changed
///
/// This function must only be used from the layer flush task, and may not be called concurrently.
/// This function must only be used from the layer flush task.
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
// We do a simple load/store cycle: that's why this function isn't safe for concurrent use.
let old_value = self.disk_consistent_lsn.load();
if new_value != old_value {
assert!(new_value >= old_value);
self.disk_consistent_lsn.store(new_value);
true
} else {
false
}
let old_value = self.disk_consistent_lsn.fetch_max(new_value);
assert!(new_value >= old_value, "disk_consistent_lsn must grow monotonically at runtime; current {old_value}, offered {new_value}");
new_value != old_value
}
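For illustration, a minimal sketch of the `fetch_max` pattern that replaces the separate load/store cycle, using std's `AtomicU64` as a stand-in for the crate's LSN atomic (an assumption about its semantics):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

struct DiskConsistentLsn(AtomicU64);

impl DiskConsistentLsn {
    /// Returns true if the stored value changed.
    fn advance(&self, new_value: u64) -> bool {
        // fetch_max is a single atomic read-modify-write, so no concurrent
        // writer can slip in between a separate load and store.
        let old_value = self.0.fetch_max(new_value, Ordering::AcqRel);
        assert!(new_value >= old_value, "must grow monotonically");
        new_value != old_value
    }
}

fn main() {
    let lsn = DiskConsistentLsn(AtomicU64::new(0));
    assert!(lsn.advance(10));  // value changed
    assert!(!lsn.advance(10)); // same value, no change
}
```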
/// Update metadata file
@@ -4728,6 +4219,12 @@ impl Timeline {
return;
}
if self.current_logical_size.current_size().is_exact() {
// root timelines are initialized with exact count, but never start the background
// calculation
return;
}
if let Some(await_bg_cancel) = self
.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore
@@ -5675,7 +5172,7 @@ impl Timeline {
let file_size = layer.layer_desc().file_size;
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
let last_activity_ts = layer.access_stats().latest_activity();
let last_activity_ts = layer.latest_activity();
EvictionCandidate {
layer: layer.into(),
@@ -5698,6 +5195,22 @@ impl Timeline {
}
}
/// Persistently blocks gc for `Manual` reason.
///
/// Returns true if no such block existed before, false otherwise.
pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result<bool> {
use crate::tenant::remote_timeline_client::index::GcBlockingReason;
assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
tenant.gc_block.insert(self, GcBlockingReason::Manual).await
}
/// Persistently unblocks gc for `Manual` reason.
pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> {
use crate::tenant::remote_timeline_client::index::GcBlockingReason;
assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
tenant.gc_block.remove(self, GcBlockingReason::Manual).await
}
#[cfg(test)]
pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
self.last_record_lsn.advance(new_lsn);
@@ -5878,8 +5391,6 @@ impl Timeline {
}
}
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
/// Tracking writes ingestion does to a particular in-memory layer.
///
/// Cleared upon freezing a layer.
@@ -5931,6 +5442,44 @@ enum OpenLayerAction {
}
impl<'a> TimelineWriter<'a> {
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
pub(crate) async fn put(
&mut self,
key: Key,
lsn: Lsn,
value: &Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Avoid doing allocations for "small" values.
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
value.ser_into(&mut buf)?;
let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
let action = self.get_open_layer_action(lsn, buf_size);
let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
let res = layer.put_value(key, lsn, &buf, ctx).await;
if res.is_ok() {
// Update the current size only when the entire write was ok.
// In case of failures, we may have had partial writes which
// render the size tracking out of sync. That's ok because
// the checkpoint distance should be significantly smaller
// than the S3 single shot upload limit of 5GiB.
let state = self.write_guard.as_mut().unwrap();
state.current_size += buf_size;
state.prev_lsn = Some(lsn);
state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
}
res
}
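As a side note on the allocation comment above, a hedged sketch of the `smallvec` behavior being relied on: payloads up to the inline capacity stay on the stack, and only larger ones spill to the heap. `spilled()` makes the inline-vs-heap distinction observable:

```rust
use smallvec::SmallVec;

fn main() {
    let mut buf: SmallVec<[u8; 256]> = SmallVec::new();
    buf.extend_from_slice(&[0u8; 100]);
    assert!(!buf.spilled()); // 100 bytes still fit inline: no heap allocation
    buf.extend_from_slice(&[0u8; 200]);
    assert!(buf.spilled()); // 300 bytes exceed the inline capacity of 256
}
```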
async fn handle_open_layer_action(
&mut self,
at: Lsn,
@@ -6033,52 +5582,18 @@ impl<'a> TimelineWriter<'a> {
}
/// Put a batch of keys at the specified Lsns.
///
/// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]).
pub(crate) async fn put_batch(
&mut self,
batch: Vec<(Key, Lsn, Value)>,
batch: VecMap<Lsn, (Key, Value)>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
if batch.is_empty() {
return Ok(());
for (lsn, (key, val)) in batch {
self.put(key, lsn, &val, ctx).await?
}
let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
let batch_max_lsn = serialized_batch.max_lsn;
let buf_size: u64 = serialized_batch.raw.len() as u64;
let action = self.get_open_layer_action(batch_max_lsn, buf_size);
let layer = self
.handle_open_layer_action(batch_max_lsn, action, ctx)
.await?;
let res = layer.put_batch(&serialized_batch, ctx).await;
if res.is_ok() {
// Update the current size only when the entire write was ok.
// In case of failures, we may have had partial writes which
// render the size tracking out of sync. That's ok because
// the checkpoint distance should be significantly smaller
// than the S3 single shot upload limit of 5GiB.
let state = self.write_guard.as_mut().unwrap();
state.current_size += buf_size;
state.prev_lsn = Some(batch_max_lsn);
state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
}
res
}
#[cfg(test)]
/// Test helper, for tests that would like to poke individual values without composing a batch
pub(crate) async fn put(
&mut self,
key: Key,
lsn: Lsn,
value: &Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.put_batch(vec![(key, lsn, value.clone())], ctx).await
Ok(())
}
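For context on the size/LSN bookkeeping above, a hedged stand-in for the batching idea (a hypothetical `serialize_batch` helper, not the real `SerializedBatch` API): serialize all values into one buffer and track the batch's max LSN, which drives the open-layer roll decision:

```rust
// Toy types: u64 keys and LSNs, raw byte values.
fn serialize_batch(batch: &[(u64, u64, Vec<u8>)]) -> (Vec<u8>, u64) {
    let mut raw = Vec::new();
    let mut max_lsn = 0;
    for (_key, lsn, value) in batch {
        raw.extend_from_slice(value);
        max_lsn = max_lsn.max(*lsn);
    }
    (raw, max_lsn)
}

fn main() {
    let (raw, max_lsn) = serialize_batch(&[(1, 10, vec![0; 8]), (2, 12, vec![0; 4])]);
    assert_eq!((raw.len(), max_lsn), (12, 12));
}
```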
pub(crate) async fn delete_batch(


@@ -19,8 +19,10 @@ use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::key::KEY_SIZE;
use pageserver_api::keyspace::ShardedRange;
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use serde::Serialize;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
@@ -41,6 +43,7 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use crate::keyspace::KeySpace;
use crate::repository::{Key, Value};
use crate::walrecord::NeonWalRecord;
use utils::lsn::Lsn;
@@ -73,6 +76,7 @@ impl KeyHistoryRetention {
key: Key,
delta_writer: &mut Vec<(Key, Lsn, Value)>,
mut image_writer: Option<&mut ImageLayerWriter>,
stat: &mut CompactionStatistics,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut first_batch = true;
@@ -82,6 +86,7 @@ impl KeyHistoryRetention {
let Value::Image(img) = &logs[0].1 else {
unreachable!()
};
stat.produce_image_key(img);
if let Some(image_writer) = image_writer.as_mut() {
image_writer.put_image(key, img.clone(), ctx).await?;
} else {
@@ -89,24 +94,111 @@ impl KeyHistoryRetention {
}
} else {
for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
}
}
first_batch = false;
} else {
for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
}
}
}
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
for (lsn, val) in above_horizon_logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
}
Ok(())
}
}
#[derive(Debug, Serialize, Default)]
struct CompactionStatisticsNumSize {
num: u64,
size: u64,
}
#[derive(Debug, Serialize, Default)]
pub struct CompactionStatistics {
delta_layer_visited: CompactionStatisticsNumSize,
image_layer_visited: CompactionStatisticsNumSize,
delta_layer_produced: CompactionStatisticsNumSize,
image_layer_produced: CompactionStatisticsNumSize,
num_delta_layer_discarded: usize,
num_image_layer_discarded: usize,
num_unique_keys_visited: usize,
wal_keys_visited: CompactionStatisticsNumSize,
image_keys_visited: CompactionStatisticsNumSize,
wal_produced: CompactionStatisticsNumSize,
image_produced: CompactionStatisticsNumSize,
}
impl CompactionStatistics {
fn estimated_size_of_value(val: &Value) -> usize {
match val {
Value::Image(img) => img.len(),
Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
_ => std::mem::size_of::<NeonWalRecord>(),
}
}
fn estimated_size_of_key() -> usize {
KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
}
fn visit_delta_layer(&mut self, size: u64) {
self.delta_layer_visited.num += 1;
self.delta_layer_visited.size += size;
}
fn visit_image_layer(&mut self, size: u64) {
self.image_layer_visited.num += 1;
self.image_layer_visited.size += size;
}
fn on_unique_key_visited(&mut self) {
self.num_unique_keys_visited += 1;
}
fn visit_wal_key(&mut self, val: &Value) {
self.wal_keys_visited.num += 1;
self.wal_keys_visited.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn visit_image_key(&mut self, val: &Value) {
self.image_keys_visited.num += 1;
self.image_keys_visited.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn produce_key(&mut self, val: &Value) {
match val {
Value::Image(img) => self.produce_image_key(img),
Value::WalRecord(_) => self.produce_wal_key(val),
}
}
fn produce_wal_key(&mut self, val: &Value) {
self.wal_produced.num += 1;
self.wal_produced.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn produce_image_key(&mut self, val: &Bytes) {
self.image_produced.num += 1;
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
}
fn discard_delta_layer(&mut self) {
self.num_delta_layer_discarded += 1;
}
fn discard_image_layer(&mut self) {
self.num_image_layer_discarded += 1;
}
fn produce_delta_layer(&mut self, size: u64) {
self.delta_layer_produced.num += 1;
self.delta_layer_produced.size += size;
}
fn produce_image_layer(&mut self, size: u64) {
self.image_layer_produced.num += 1;
self.image_layer_produced.size += size;
}
}
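Since these counters are later emitted as a single JSON log line, here is a trimmed-down sketch of what the derived `Serialize` produces (illustrative field subset, assuming the `serde`/`serde_json` crates as used elsewhere in the crate):

```rust
use serde::Serialize;

#[derive(Serialize, Default)]
struct NumSize {
    num: u64,
    size: u64,
}

#[derive(Serialize, Default)]
struct Stats {
    delta_layer_visited: NumSize,
    image_layer_produced: NumSize,
}

fn main() {
    let mut stat = Stats::default();
    stat.delta_layer_visited.num += 1;
    stat.delta_layer_visited.size += 4096;
    // {"delta_layer_visited":{"num":1,"size":4096},"image_layer_produced":{"num":0,"size":0}}
    println!("{}", serde_json::to_string(&stat).unwrap());
}
```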
impl Timeline {
/// TODO: cancellation
///
@@ -118,12 +210,18 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<bool, CompactionError> {
if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
self.compact_with_gc(cancel, ctx)
self.compact_with_gc(cancel, flags, ctx)
.await
.map_err(CompactionError::Other)?;
return Ok(false);
}
if flags.contains(CompactFlags::DryRun) {
return Err(CompactionError::Other(anyhow!(
"dry-run mode is not supported for legacy compaction for now"
)));
}
// High level strategy for compaction / image creation:
//
// 1. First, calculate the desired "partitioning" of the
@@ -1644,6 +1742,7 @@ impl Timeline {
pub(crate) async fn compact_with_gc(
self: &Arc<Self>,
cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use std::collections::BTreeSet;
@@ -1667,12 +1766,16 @@ impl Timeline {
)
.await?;
info!("running enhanced gc bottom-most compaction");
let dry_run = flags.contains(CompactFlags::DryRun);
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
scopeguard::defer! {
info!("done enhanced gc bottom-most compaction");
};
let mut stat = CompactionStatistics::default();
// Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
// The layer selection has the following properties:
// 1. If a layer is in the selection, all layers below it are in the selection.
@@ -1743,6 +1846,9 @@ impl Timeline {
let key_range = desc.get_key_range();
delta_split_points.insert(key_range.start);
delta_split_points.insert(key_range.end);
stat.visit_delta_layer(desc.file_size());
} else {
stat.visit_image_layer(desc.file_size());
}
}
let mut delta_layers = Vec::new();
@@ -1778,6 +1884,8 @@ impl Timeline {
tline: &Arc<Timeline>,
lowest_retain_lsn: Lsn,
ctx: &RequestContext,
stats: &mut CompactionStatistics,
dry_run: bool,
last_batch: bool,
) -> anyhow::Result<Option<FlushDeltaResult>> {
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
@@ -1834,6 +1942,7 @@ impl Timeline {
let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
drop(guard);
if layer_generation == tline.generation {
stats.discard_delta_layer();
// TODO: depending on whether we design this compaction process to run along with
// other compactions, there could be layer map modifications after we drop the
// layer guard, and in case it creates duplicated layer key, we will still error
@@ -1861,6 +1970,11 @@ impl Timeline {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
stats.produce_delta_layer(delta_layer_writer.size());
if dry_run {
return Ok(None);
}
let (desc, path) = delta_layer_writer
.finish(delta_key.key_range.end, ctx)
.await?;
@@ -1956,6 +2070,13 @@ impl Timeline {
let mut current_delta_split_point = 0;
let mut delta_layers = Vec::new();
while let Some((key, lsn, val)) = merge_iter.next().await? {
if cancel.is_cancelled() {
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
}
match val {
Value::Image(_) => stat.visit_image_key(&val),
Value::WalRecord(_) => stat.visit_wal_key(&val),
}
if last_key.is_none() || last_key.as_ref() == Some(&key) {
if last_key.is_none() {
last_key = Some(key);
@@ -1963,6 +2084,7 @@ impl Timeline {
accumulated_values.push((key, lsn, val));
} else {
let last_key = last_key.as_mut().unwrap();
stat.on_unique_key_visited();
let retention = self
.generate_key_retention(
*last_key,
@@ -1979,6 +2101,7 @@ impl Timeline {
*last_key,
&mut delta_values,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
@@ -1991,6 +2114,8 @@ impl Timeline {
self,
lowest_retain_lsn,
ctx,
&mut stat,
dry_run,
false,
)
.await?,
@@ -2003,6 +2128,7 @@ impl Timeline {
let last_key = last_key.expect("no keys produced during compaction");
// TODO: move this part to the loop body
stat.on_unique_key_visited();
let retention = self
.generate_key_retention(
last_key,
@@ -2019,6 +2145,7 @@ impl Timeline {
last_key,
&mut delta_values,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
@@ -2031,6 +2158,8 @@ impl Timeline {
self,
lowest_retain_lsn,
ctx,
&mut stat,
dry_run,
true,
)
.await?,
@@ -2038,12 +2167,28 @@ impl Timeline {
assert!(delta_values.is_empty(), "unprocessed keys");
let image_layer = if discard_image_layer {
stat.discard_image_layer();
None
} else if let Some(writer) = image_layer_writer {
Some(writer.finish(self, ctx).await?)
stat.produce_image_layer(writer.size());
if !dry_run {
Some(writer.finish(self, ctx).await?)
} else {
None
}
} else {
None
};
info!(
"gc-compaction statistics: {}",
serde_json::to_string(&stat)?
);
if dry_run {
return Ok(());
}
info!(
"produced {} delta layers and {} image layers",
delta_layers.len(),
@@ -2067,6 +2212,7 @@ impl Timeline {
let mut layer_selection = layer_selection;
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
compact_to.extend(image_layer);
// Step 3: Place back to the layer map.
{
let mut guard = self.layers.write().await;


@@ -230,6 +230,8 @@ impl DeleteTimelineFlow {
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Hard).await;
tenant.gc_block.before_delete(&timeline);
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at"


@@ -225,7 +225,7 @@ impl Timeline {
continue;
}
let last_activity_ts = layer.access_stats().latest_activity();
let last_activity_ts = layer.latest_activity();
let no_activity_for = match now.duration_since(last_activity_ts) {
Ok(d) => d,


@@ -1,967 +0,0 @@
//! An efficient way to keep the timeline gate open without preventing
//! timeline shutdown for longer than a single call to a timeline method.
//!
//! # Motivation
//!
//! On a single page service connection, we're typically serving a single TenantTimelineId.
//!
//! Without sharding, there is a single Timeline object to which we dispatch
//! all requests. For example, a getpage request gets dispatched to the
//! Timeline::get method of the Timeline object that represents the
//! (tenant,timeline) of that connection.
//!
//! With sharding, for each request that comes in on the connection,
//! we first have to perform shard routing based on the requested key (=~ page number).
//! The result of shard routing is a Timeline object.
//! We then dispatch the request to that Timeline object.
//!
//! Regardless of whether the tenant is sharded or not, we want to ensure that
//! we hold the Timeline gate open while we're invoking the method on the
//! Timeline object.
//!
//! However, we want to avoid the overhead of entering the gate for every
//! method invocation.
//!
//! Further, for shard routing, we want to avoid calling the tenant manager to
//! resolve the shard for every request. Instead, we want to cache the
//! routing result so we can bypass the tenant manager for all subsequent requests
//! that get routed to that shard.
//!
//! Regardless of how we accomplish the above, it should not
//! prevent the Timeline from shutting down promptly.
//!
//! # Design
//!
//! There are three user-facing data structures:
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
//!
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
//!
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
//!
//! To dispatch a request, the page service connection calls `Cache::get`.
//!
//! A cache miss means we consult the tenant manager for shard routing,
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
//!
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
//! and find the `Weak<HandleInner>` in the cache.
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
//!
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
//!
//! # Memory Management / How The Reference Cycle Is Broken
//!
//! The attentive reader may have noticed the strong reference cycle
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
//!
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
//!
//! The cycle is broken by either
//! - `PerTimelineState::shutdown` or
//! - dropping the `Cache`.
//!
//! Concurrently existing `Handle`s will extend the existence of the cycle.
//! However, since `Handle`s are short-lived and new `Handle`s are not
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
//! that extension of the cycle is bounded.
//!
//! # Fast Path for Shard Routing
//!
//! The `Cache` has a fast path for shard routing to avoid calling into
//! the tenant manager for every request.
//!
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
//!
//! The current implementation uses the first entry in the hash map
//! to determine the `ShardParameters` and derive the correct
//! `ShardIndex` for the requested key.
//!
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
//!
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
//! it's a hit.
//!
//! ## Cache invalidation
//!
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
//! The only reasons why an entry in the cache can become stale are:
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
//! being detached, timeline or shard deleted, or pageserver is shutting down.
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
//!
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
//! timeline has shut down, and when that happens, we remove the entry from the cache.
//!
//! Regarding (2), the insight is that it is totally fine to keep dispatching requests
//! to the parent shard during a shard split. Eventually, the shard split task will
//! shut down the parent => case (1).
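To make the ownership story above concrete, a hedged, dependency-free sketch of the Weak-in-`Cache` / Arc-in-`PerTimelineState` pattern and its lazy invalidation (stub types; the real `HandleInner` also carries the timeline and the gate guard):

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Weak};

struct HandleInner {
    shut_down: AtomicBool, // real type also holds the timeline + gate guard
}

fn main() {
    // PerTimelineState owns the only long-lived strong reference.
    let per_timeline: Arc<HandleInner> = Arc::new(HandleInner {
        shut_down: AtomicBool::new(false),
    });
    // The per-connection cache only holds a Weak.
    let cached: Weak<HandleInner> = Arc::downgrade(&per_timeline);

    // Fast path per request: one atomic upgrade, no tenant manager call.
    let handle = cached.upgrade().expect("timeline alive");
    assert!(!handle.shut_down.load(Ordering::Relaxed));
    drop(handle); // Handles are short-lived: dropped after one dispatch.

    // Shutdown: flip the flag for existing hits, drop the strong ref for misses.
    per_timeline.shut_down.store(true, Ordering::Relaxed);
    drop(per_timeline);
    assert!(cached.upgrade().is_none()); // the cache invalidates itself lazily
}
```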
use std::collections::hash_map;
use std::collections::HashMap;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::Weak;
use pageserver_api::shard::ShardIdentity;
use tracing::instrument;
use tracing::trace;
use utils::id::TimelineId;
use utils::shard::ShardIndex;
use utils::shard::ShardNumber;
use crate::tenant::mgr::ShardSelector;
/// The requirement for Debug is so that #[derive(Debug)] works in some places.
pub(crate) trait Types: Sized + std::fmt::Debug {
type TenantManagerError: Sized + std::fmt::Debug;
type TenantManager: TenantManager<Self> + Sized;
type Timeline: ArcTimeline<Self> + Sized;
}
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct CacheId(u64);
impl CacheId {
fn next() -> Self {
static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
if id == 0 {
panic!("CacheId::new() returned 0, overflow");
}
Self(id)
}
}
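A note on the overflow guard above: `fetch_add` on a `u64` starting at 1 only wraps to 0 after 2^64 allocations, so the panic is a cheap tripwire rather than an expected path. A standalone sketch of the same pattern:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

fn next_cache_id() -> u64 {
    static NEXT_ID: AtomicU64 = AtomicU64::new(1);
    let id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
    // Wrap-around would hand out 0 and then duplicate ids; fail loudly instead.
    assert_ne!(id, 0, "cache id counter overflowed");
    id
}

fn main() {
    let (a, b) = (next_cache_id(), next_cache_id());
    assert_ne!(a, b); // ids are unique within the process
}
```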
/// See module-level comment.
pub(crate) struct Cache<T: Types> {
id: CacheId,
map: Map<T>,
}
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
impl<T: Types> Default for Cache<T> {
fn default() -> Self {
Self {
id: CacheId::next(),
map: Default::default(),
}
}
}
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
pub(crate) struct ShardTimelineId {
pub(crate) shard_index: ShardIndex,
pub(crate) timeline_id: TimelineId,
}
/// See module-level comment.
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
struct HandleInner<T: Types> {
shut_down: AtomicBool,
timeline: T::Timeline,
// The timeline's gate held open.
_gate_guard: utils::sync::gate::GateGuard,
}
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
///
/// See module-level comment for details.
pub struct PerTimelineState<T: Types> {
// None = shutting down
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
}
impl<T: Types> Default for PerTimelineState<T> {
fn default() -> Self {
Self {
handles: Mutex::new(Some(Default::default())),
}
}
}
/// Abstract view of [`crate::tenant::mgr`], for testability.
pub(crate) trait TenantManager<T: Types> {
/// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
/// Errors are returned as [`GetError::TenantManager`].
async fn resolve(
&self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> Result<T::Timeline, T::TenantManagerError>;
}
/// Abstract view of an [`Arc<Timeline>`], for testability.
pub(crate) trait ArcTimeline<T: Types>: Clone {
fn gate(&self) -> &utils::sync::gate::Gate;
fn shard_timeline_id(&self) -> ShardTimelineId;
fn get_shard_identity(&self) -> &ShardIdentity;
fn per_timeline_state(&self) -> &PerTimelineState<T>;
}
/// Errors returned by [`Cache::get`].
#[derive(Debug)]
pub(crate) enum GetError<T: Types> {
TenantManager(T::TenantManagerError),
TimelineGateClosed,
PerTimelineStateShutDown,
}
/// Internal type used in [`Cache::get`].
enum RoutingResult<T: Types> {
FastPath(Handle<T>),
SlowPath(ShardTimelineId),
NeedConsultTenantManager,
}
impl<T: Types> Cache<T> {
/// See module-level comment for details.
///
/// Does NOT check for the shutdown state of [`Types::Timeline`].
/// Instead, the methods of [`Types::Timeline`] that are invoked through
/// the [`Handle`] are responsible for checking these conditions
/// and if so, return an error that causes the page service to
/// close the connection.
#[instrument(level = "trace", skip_all)]
pub(crate) async fn get(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
// terminates because each iteration removes an element from the map
loop {
let handle = self
.get_impl(timeline_id, shard_selector, tenant_manager)
.await?;
if handle.0.shut_down.load(Ordering::Relaxed) {
let removed = self
.map
.remove(&handle.0.timeline.shard_timeline_id())
.expect("invariant of get_impl is that the returned handle is in the map");
assert!(
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
"shard_timeline_id() incorrect?"
);
} else {
return Ok(handle);
}
}
}
#[instrument(level = "trace", skip_all)]
async fn get_impl(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
let miss: ShardSelector = {
let routing_state = self.shard_routing(timeline_id, shard_selector);
match routing_state {
RoutingResult::FastPath(handle) => return Ok(handle),
RoutingResult::SlowPath(key) => match self.map.get(&key) {
Some(cached) => match cached.upgrade() {
Some(upgraded) => return Ok(Handle(upgraded)),
None => {
trace!("handle cache stale");
self.map.remove(&key).unwrap();
ShardSelector::Known(key.shard_index)
}
},
None => ShardSelector::Known(key.shard_index),
},
RoutingResult::NeedConsultTenantManager => shard_selector,
}
};
self.get_miss(timeline_id, miss, tenant_manager).await
}
#[inline(always)]
fn shard_routing(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> RoutingResult<T> {
loop {
// terminates because each iteration removes an element from the map
let Some((first_key, first_handle)) = self.map.iter().next() else {
return RoutingResult::NeedConsultTenantManager;
};
let Some(first_handle) = first_handle.upgrade() else {
// TODO: dedup with get()
trace!("handle cache stale");
let first_key_owned = *first_key;
self.map.remove(&first_key_owned).unwrap();
continue;
};
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
shard_number: shard_num,
shard_count: first_handle_shard_identity.count,
};
let need_idx = match shard_selector {
ShardSelector::Page(key) => {
make_shard_index(first_handle_shard_identity.get_shard_number(&key))
}
ShardSelector::Zero => make_shard_index(ShardNumber(0)),
ShardSelector::Known(shard_idx) => shard_idx,
};
let need_shard_timeline_id = ShardTimelineId {
shard_index: need_idx,
timeline_id,
};
let first_handle_shard_timeline_id = ShardTimelineId {
shard_index: first_handle_shard_identity.shard_index(),
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
};
if need_shard_timeline_id == first_handle_shard_timeline_id {
return RoutingResult::FastPath(Handle(first_handle));
} else {
return RoutingResult::SlowPath(need_shard_timeline_id);
}
}
}
#[instrument(level = "trace", skip_all)]
#[inline(always)]
async fn get_miss(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
match tenant_manager.resolve(timeline_id, shard_selector).await {
Ok(timeline) => {
let key = timeline.shard_timeline_id();
match &shard_selector {
ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
ShardSelector::Page(_) => (), // gotta trust tenant_manager
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
}
let gate_guard = match timeline.gate().enter() {
Ok(guard) => guard,
Err(_) => {
return Err(GetError::TimelineGateClosed);
}
};
trace!("creating new HandleInner");
let handle = Arc::new(
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
// so we can identify reference cycle bugs.
HandleInner {
shut_down: AtomicBool::new(false),
_gate_guard: gate_guard,
timeline: timeline.clone(),
},
);
let handle = {
let mut lock_guard = timeline
.per_timeline_state()
.handles
.lock()
.expect("mutex poisoned");
match &mut *lock_guard {
Some(per_timeline_state) => {
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
assert!(replaced.is_none(), "some earlier code left a stale handle");
match self.map.entry(key) {
hash_map::Entry::Occupied(_o) => {
// This cannot happen because
// 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry, and
// 2. we were holding &mut self during .resolve().await above, so no other thread can have inserted a handle
// while we were waiting for the tenant manager.
unreachable!()
}
hash_map::Entry::Vacant(v) => {
v.insert(Arc::downgrade(&handle));
handle
}
}
}
None => {
return Err(GetError::PerTimelineStateShutDown);
}
}
};
Ok(Handle(handle))
}
Err(e) => Err(GetError::TenantManager(e)),
}
}
}
impl<T: Types> PerTimelineState<T> {
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
/// to the [`Types::Timeline`] that embeds this per-timeline state.
/// Even if [`TenantManager::resolve`] would still resolve to it.
///
/// Already-alive [`Handle`]s will remain open, usable, and keep the [`ArcTimeline`] alive.
/// That's ok because they're short-lived. See module-level comment for details.
#[instrument(level = "trace", skip_all)]
pub(super) fn shutdown(&self) {
let handles = self
.handles
.lock()
.expect("mutex poisoned")
// NB: this .take() sets locked to None.
// That's what makes future `Cache::get` misses fail.
// Cache hits are taken care of below.
.take();
let Some(handles) = handles else {
trace!("already shut down");
return;
};
for handle in handles.values() {
// Make hits fail.
handle.shut_down.store(true, Ordering::Relaxed);
}
drop(handles);
}
}
impl<T: Types> std::ops::Deref for Handle<T> {
type Target = T::Timeline;
fn deref(&self) -> &Self::Target {
&self.0.timeline
}
}
#[cfg(test)]
impl<T: Types> Drop for HandleInner<T> {
fn drop(&mut self) {
trace!("HandleInner dropped");
}
}
// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
impl<T: Types> Drop for Cache<T> {
fn drop(&mut self) {
for (_, weak) in self.map.drain() {
if let Some(strong) = weak.upgrade() {
// handle is still being kept alive in PerTimelineState
let timeline = strong.timeline.per_timeline_state();
let mut handles = timeline.handles.lock().expect("mutex poisoned");
if let Some(handles) = &mut *handles {
let Some(removed) = handles.remove(&self.id) else {
// There could have been a shutdown in between us upgrading the weak and locking the mutex.
continue;
};
assert!(Arc::ptr_eq(&removed, &strong));
}
}
}
}
}
#[cfg(test)]
mod tests {
use pageserver_api::{
key::{rel_block_to_key, Key, DBDIR_KEY},
models::ShardParameters,
reltag::RelTag,
shard::ShardStripeSize,
};
use utils::shard::ShardCount;
use super::*;
const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
#[derive(Debug)]
struct TestTypes;
impl Types for TestTypes {
type TenantManagerError = anyhow::Error;
type TenantManager = StubManager;
type Timeline = Arc<StubTimeline>;
}
struct StubManager {
shards: Vec<Arc<StubTimeline>>,
}
struct StubTimeline {
gate: utils::sync::gate::Gate,
id: TimelineId,
shard: ShardIdentity,
per_timeline_state: PerTimelineState<TestTypes>,
myself: Weak<StubTimeline>,
}
impl StubTimeline {
fn getpage(&self) {
// do nothing
}
}
impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
fn gate(&self) -> &utils::sync::gate::Gate {
&self.gate
}
fn shard_timeline_id(&self) -> ShardTimelineId {
ShardTimelineId {
shard_index: self.shard.shard_index(),
timeline_id: self.id,
}
}
fn get_shard_identity(&self) -> &ShardIdentity {
&self.shard
}
fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
&self.per_timeline_state
}
}
impl TenantManager<TestTypes> for StubManager {
async fn resolve(
&self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> anyhow::Result<Arc<StubTimeline>> {
for timeline in &self.shards {
if timeline.id == timeline_id {
match &shard_selector {
ShardSelector::Zero if timeline.shard.is_shard_zero() => {
return Ok(Arc::clone(timeline));
}
ShardSelector::Zero => continue,
ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
return Ok(Arc::clone(timeline));
}
ShardSelector::Page(_) => continue,
ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
return Ok(Arc::clone(timeline));
}
ShardSelector::Known(_) => continue,
}
}
}
anyhow::bail!("not found")
}
}
#[tokio::test(start_paused = true)]
async fn test_timeline_shutdown() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mgr = StubManager {
shards: vec![shard0.clone()],
};
let key = DBDIR_KEY;
let mut cache = Cache::<TestTypes>::default();
//
// fill the cache
//
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(2, 1),
"strong: shard0, mgr; weak: myself"
);
let handle: Handle<_> = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
let handle_inner_weak = Arc::downgrade(&handle.0);
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
assert_eq!(
(
Weak::strong_count(&handle_inner_weak),
Weak::weak_count(&handle_inner_weak)
),
(2, 2),
"strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
);
assert_eq!(cache.map.len(), 1);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
);
drop(handle);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
);
//
// demonstrate that Handle holds up gate closure
// but shutdown prevents new handles from being handed out
//
tokio::select! {
_ = shard0.gate.close() => {
panic!("cache and per-timeline handler state keep cache open");
}
_ = tokio::time::sleep(FOREVER) => {
// NB: first poll of close() makes it enter closing state
}
}
let handle = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
// SHUTDOWN
shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
assert_eq!(
1,
Weak::strong_count(&handle_inner_weak),
"through local var handle"
);
assert_eq!(
cache.map.len(),
1,
"this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(via handle), shard0, mgr; weak: myself"
);
// this handle is perfectly usable
handle.getpage();
cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.err()
.expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
assert_eq!(
cache.map.len(),
0,
"first access after shutdown cleans up the Weak's from the cache"
);
tokio::select! {
_ = shard0.gate.close() => {
panic!("handle is keeping gate open");
}
_ = tokio::time::sleep(FOREVER) => { }
}
drop(handle);
assert_eq!(
0,
Weak::strong_count(&handle_inner_weak),
"the HandleInner destructor already ran"
);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(2, 1),
"strong: shard0, mgr; weak: myself"
);
// closing gate succeeds after dropping handle
tokio::select! {
_ = shard0.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
panic!("handle is dropped, no other gate holders exist")
}
}
// map gets cleaned on next lookup
cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.err()
.expect("documented behavior: can't get new handle after shutdown");
assert_eq!(cache.map.len(), 0);
// ensure all refs to shard0 are gone and we're not leaking anything
let myself = Weak::clone(&shard0.myself);
drop(shard0);
drop(mgr);
assert_eq!(Weak::strong_count(&myself), 0);
}
#[tokio::test]
async fn test_multiple_timelines_and_deletion() {
crate::tenant::harness::setup_logging();
let timeline_a = TimelineId::generate();
let timeline_b = TimelineId::generate();
assert_ne!(timeline_a, timeline_b);
let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_a,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_b,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mut mgr = StubManager {
shards: vec![timeline_a.clone(), timeline_b.clone()],
};
let key = DBDIR_KEY;
let mut cache = Cache::<TestTypes>::default();
cache
.get(timeline_a.id, ShardSelector::Page(key), &mgr)
.await
.expect("we have it");
cache
.get(timeline_b.id, ShardSelector::Page(key), &mgr)
.await
.expect("we have it");
assert_eq!(cache.map.len(), 2);
// delete timeline A
timeline_a.per_timeline_state.shutdown();
mgr.shards.retain(|t| t.id != timeline_a.id);
assert!(
mgr.resolve(timeline_a.id, ShardSelector::Page(key))
.await
.is_err(),
"broken StubManager implementation"
);
assert_eq!(
cache.map.len(),
2,
"cache still has a Weak handle to Timeline A"
);
cache
.get(timeline_a.id, ShardSelector::Page(key), &mgr)
.await
.err()
.expect("documented behavior: can't get new handle after shutdown");
assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
cache
.get(timeline_b.id, ShardSelector::Page(key), &mgr)
.await
.expect("we still have it");
}
fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
rel_block_to_key(
RelTag {
spcnode: 1663,
dbnode: 208101,
relnode: 2620,
forknum: 0,
},
shard.0 as u32 * params.stripe_size.0,
)
}
#[tokio::test(start_paused = true)]
async fn test_shard_split() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let parent = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child_params = ShardParameters {
count: ShardCount(2),
stripe_size: ShardStripeSize::default(),
};
let child0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child1 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child_shards_by_shard_number = [child0.clone(), child1.clone()];
let mut cache = Cache::<TestTypes>::default();
// fill the cache with the parent
for i in 0..2 {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![parent.clone()],
},
)
.await
.expect("we have it");
assert!(
Weak::ptr_eq(&handle.myself, &parent.myself),
"mgr returns parent first"
);
drop(handle);
}
//
// SHARD SPLIT: tenant manager changes, but the cache isn't informed
//
// while we haven't shut down the parent, the cache will return the cached parent, even
// if the tenant manager returns the child
for i in 0..2 {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![], // doesn't matter what's in here, the cache is fully loaded
},
)
.await
.expect("we have it");
assert!(
Weak::ptr_eq(&handle.myself, &parent.myself),
"mgr returns parent"
);
drop(handle);
}
let parent_handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
&StubManager {
shards: vec![parent.clone()],
},
)
.await
.expect("we have it");
assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
// invalidate the cache
parent.per_timeline_state.shutdown();
// the cache will now return the child, even though the parent handle still exists
for i in 0..2 {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
},
)
.await
.expect("we have it");
assert!(
Weak::ptr_eq(
&handle.myself,
&child_shards_by_shard_number[i as usize].myself
),
"mgr returns child"
);
drop(handle);
}
// all the while the parent handle kept the parent gate open
tokio::select! {
_ = parent_handle.gate.close() => {
panic!("parent handle is keeping gate open");
}
_ = tokio::time::sleep(FOREVER) => { }
}
drop(parent_handle);
tokio::select! {
_ = parent.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
panic!("parent handle is dropped, no other gate holders exist")
}
}
}
#[tokio::test(start_paused = true)]
async fn test_connection_handler_exit() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mgr = StubManager {
shards: vec![shard0.clone()],
};
let key = DBDIR_KEY;
// Simulate 10 connections that are opened, used, and closed
let mut used_handles = vec![];
for _ in 0..10 {
let mut cache = Cache::<TestTypes>::default();
let handle = {
let handle = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
handle
};
handle.getpage();
used_handles.push(Arc::downgrade(&handle.0));
}
// No handles exist anymore, so nothing holds the gate open and shutdown isn't required
assert!(used_handles
.iter()
.all(|weak| Weak::strong_count(weak) == 0));
// ... thus the gate should close immediately, even without shutdown
tokio::select! {
_ = shard0.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
panic!("handle is dropped, no other gate holders exist")
}
}
}
}


@@ -259,13 +259,10 @@ impl LayerManager {
new_layer.layer_desc().lsn_range
);
// Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
// Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
// be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
// always marking rewritten layers as visible.
new_layer
.as_ref()
.access_stats()
.set_visibility(old_layer.access_stats().visibility());
new_layer.as_ref().set_visibility(old_layer.visibility());
// Safety: we may never rewrite the same file in-place. Callers are responsible
// for ensuring that they only rewrite layers after something changes the path,


@@ -122,6 +122,10 @@ impl CurrentLogicalSize {
Self::Exact(_) => Accuracy::Exact,
}
}
pub(crate) fn is_exact(&self) -> bool {
matches!(self, Self::Exact(_))
}
}
impl LogicalSize {


@@ -27,8 +27,8 @@ use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
pgdatadir_mapping::DatadirModification,
task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
walingest::WalIngest,
walrecord::DecodedWALRecord,
@@ -342,10 +342,7 @@ pub(super) async fn handle_walreceiver_connection(
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.
uncommitted_records += 1;
if uncommitted_records >= ingest_batch_size
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
if uncommitted_records >= ingest_batch_size {
WAL_INGEST
.records_committed
.inc_by(uncommitted_records - filtered_records);

poetry.lock generated

@@ -1514,6 +1514,20 @@ files = [
[package.dependencies]
six = "*"
[[package]]
name = "kafka-python"
version = "2.0.2"
description = "Pure Python client for Apache Kafka"
optional = false
python-versions = "*"
files = [
{file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
{file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
]
[package.extras]
crc32c = ["crc32c"]
[[package]]
name = "lazy-object-proxy"
version = "1.10.0"
@@ -3357,4 +3371,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72"


@@ -1,8 +1,7 @@
[tool.poetry]
name = "neon"
version = "0.1.0"
description = ""
authors = []
package-mode = false
[tool.poetry.dependencies]
python = "^3.9"
@@ -42,6 +41,7 @@ httpx = {extras = ["http2"], version = "^0.26.0"}
pytest-repeat = "^0.9.3"
websockets = "^12.0"
clickhouse-connect = "^0.7.16"
kafka-python = "^2.0.2"
[tool.poetry.group.dev.dependencies]
mypy = "==1.3.0"
@@ -75,6 +75,7 @@ module = [
"allure.*",
"allure_commons.*",
"allure_pytest.*",
"kafka.*",
]
ignore_missing_imports = true


@@ -642,8 +642,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
logging::replace_panic_hook_with_tracing_panic_hook().forget();
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
info!("version: {GIT_VERSION}");
info!("build_tag: {BUILD_TAG}");
info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}");
metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
// On any shutdown signal, log receival and exit.


@@ -92,7 +92,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
.push(format!("index_part.json version: {}", index_part.version()))
}
let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2);
let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3);
if !newest_versions.any(|ip| ip == &index_part.version()) {
info!(
"index_part.json version is not latest: {}",
@@ -172,8 +172,11 @@ pub(crate) async fn branch_cleanup_and_check_errors(
}
}
BlobDataParseResult::Relic => {}
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
parse_errors
BlobDataParseResult::Incorrect {
errors,
s3_layers: _,
} => result.errors.extend(
errors
.into_iter()
.map(|error| format!("parse error: {error}")),
),
@@ -300,7 +303,10 @@ pub(crate) enum BlobDataParseResult {
},
/// The remains of a deleted Timeline (i.e. an initdb archive only)
Relic,
Incorrect(Vec<String>),
Incorrect {
errors: Vec<String>,
s3_layers: HashSet<(LayerName, Generation)>,
},
}
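The move from a tuple to a struct variant means every call site must name, or explicitly ignore, each field, so newly added data such as `s3_layers` cannot be dropped silently. A hedged, standalone illustration:

```rust
enum ParseResult {
    Incorrect {
        errors: Vec<String>,
        s3_layers: Vec<String>,
    },
}

fn main() {
    let r = ParseResult::Incorrect {
        errors: vec!["bad index_part.json".to_string()],
        s3_layers: vec!["000000...-layer".to_string()],
    };
    match r {
        // `s3_layers: _` records that the field is deliberately ignored here
        ParseResult::Incorrect {
            errors,
            s3_layers: _,
        } => eprintln!("parse errors: {errors:?}"),
    }
}
```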
pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> {
@@ -443,7 +449,7 @@ pub(crate) async fn list_timeline_blobs(
}
Ok(S3TimelineBlobData {
blob_data: BlobDataParseResult::Incorrect(errors),
blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
unused_index_keys: index_part_keys,
unknown_keys,
})


@@ -208,21 +208,21 @@ async fn main() -> anyhow::Result<()> {
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
tracing::error!("Fatal scrub errors detected");
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
Err(anyhow::anyhow!(
tracing::error!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
);
}
Ok(())
}
}
}


@@ -4,7 +4,7 @@ use anyhow::{anyhow, Context};
use async_stream::{stream, try_stream};
use aws_sdk_s3::{types::ObjectIdentifier, Client};
use futures::StreamExt;
use remote_storage::{GenericRemoteStorage, ListingMode};
use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
use tokio_stream::Stream;
use crate::{
@@ -276,3 +276,33 @@ pub(crate) fn stream_listing<'a>(
}
}
}
pub(crate) fn stream_listing_generic<'a>(
remote_client: &'a GenericRemoteStorage,
target: &'a S3Target,
) -> impl Stream<Item = anyhow::Result<(RemotePath, Option<ListingObject>)>> + 'a {
let listing_mode = if target.delimiter.is_empty() {
ListingMode::NoDelimiter
} else {
ListingMode::WithDelimiter
};
try_stream! {
let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
remote_client,
listing_mode,
target,
));
while let Some(list) = objects_stream.next().await {
let list = list?;
if target.delimiter.is_empty() {
for key in list.keys {
yield (key.key.clone(), Some(key));
}
} else {
for key in list.prefixes {
yield (key, None);
}
}
}
}
}
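For reference, a hedged minimal example of the `try_stream!` consumption pattern used for these listings (toy `numbers` stream; assumes the same `async-stream`, `futures`, and `tokio` crates):

```rust
use async_stream::try_stream;
use futures::StreamExt;
use tokio_stream::Stream;

fn numbers() -> impl Stream<Item = anyhow::Result<u32>> {
    try_stream! {
        for i in 0..3 {
            yield i; // each item is wrapped in Ok(...)
        }
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut stream = std::pin::pin!(numbers());
    while let Some(item) = stream.next().await {
        let n = item?; // errors from the stream body surface here
        println!("{n}");
    }
    Ok(())
}
```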


@@ -389,10 +389,13 @@ async fn gc_ancestor(
// Post-deletion tenant location: don't try and GC it.
continue;
}
BlobDataParseResult::Incorrect(reasons) => {
BlobDataParseResult::Incorrect {
errors,
s3_layers: _, // TODO(yuchen): could still check references to these s3 layers?
} => {
// Our primary purpose isn't to report on bad data, but log this rather than skipping silently
tracing::warn!(
"Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}"
"Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}"
);
continue;
}
@@ -518,9 +521,12 @@ pub async fn pageserver_physical_gc(
// Post-deletion tenant location: don't try and GC it.
return Ok(summary);
}
BlobDataParseResult::Incorrect(reasons) => {
BlobDataParseResult::Incorrect {
errors,
s3_layers: _,
} => {
// Our primary purpose isn't to report on bad data, but log this rather than skipping silently
tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}");
tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}");
return Ok(summary);
}
};


@@ -290,13 +290,21 @@ pub async fn scan_metadata(
}
}
if let BlobDataParseResult::Parsed {
index_part: _index_part,
index_part_generation: _index_part_generation,
s3_layers,
} = &data.blob_data
{
tenant_objects.push(ttid, s3_layers.clone());
match &data.blob_data {
BlobDataParseResult::Parsed {
index_part: _index_part,
index_part_generation: _index_part_generation,
s3_layers,
} => {
tenant_objects.push(ttid, s3_layers.clone());
}
BlobDataParseResult::Relic => (),
BlobDataParseResult::Incorrect {
errors: _,
s3_layers,
} => {
tenant_objects.push(ttid, s3_layers.clone());
}
}
tenant_timeline_results.push((ttid, data));
}


@@ -1,10 +1,10 @@
use std::{collections::HashSet, str::FromStr, sync::Arc};
use aws_sdk_s3::Client;
use futures::stream::{StreamExt, TryStreamExt};
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use postgres_ffi::{XLogFileName, PG_TLI};
use remote_storage::GenericRemoteStorage;
use serde::Serialize;
use tokio_postgres::types::PgLsn;
use tracing::{error, info, trace};
@@ -14,8 +14,9 @@ use utils::{
};
use crate::{
cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
cloud_admin_api::CloudAdminApiClient, init_remote_generic,
metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget,
TenantShardTimelineId,
};
/// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
@@ -106,7 +107,7 @@ pub async fn scan_safekeeper_metadata(
let timelines = client.query(&query, &[]).await?;
info!("loaded {} timelines", timelines.len());
let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?;
let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?;
let console_config = ConsoleConfig::from_env()?;
let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
@@ -119,7 +120,7 @@ pub async fn scan_safekeeper_metadata(
let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
check_timeline(
&s3_client,
&remote_client,
&target,
&cloud_admin_api_client,
ttid,
@@ -156,7 +157,7 @@ struct TimelineCheckResult {
/// errors are logged to stderr; returns Ok(true) if timeline is consistent,
/// Ok(false) if not, Err if failed to check.
async fn check_timeline(
s3_client: &Client,
remote_client: &GenericRemoteStorage,
root: &RootTarget,
api_client: &CloudAdminApiClient,
ttid: TenantTimelineId,
@@ -187,12 +188,13 @@ async fn check_timeline(
// we need files, so unset it.
timeline_dir_target.delimiter = String::new();
let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
while let Some(obj) = stream.next().await {
let obj = obj?;
let key = obj.key();
let (key, _obj) = obj?;
let seg_name = key
.get_path()
.as_str()
.strip_prefix(&timeline_dir_target.prefix_in_bucket)
.expect("failed to extract segment name");
expected_segfiles.remove(seg_name);


@@ -269,7 +269,7 @@ impl SnapshotDownloader {
.context("Downloading timeline")?;
}
BlobDataParseResult::Relic => {}
BlobDataParseResult::Incorrect(_) => {
BlobDataParseResult::Incorrect { .. } => {
tracing::error!("Bad metadata in timeline {ttid}");
}
};


@@ -285,9 +285,9 @@ class NeonApiEndpoint:
self.project_id = project_id
eps = neon_api.get_endpoints(project_id)["endpoints"]
self.endpoint_id = eps[0]["id"]
self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[
"uri"
]
self.connstr = neon_api.get_connection_uri(
project_id, endpoint_id=self.endpoint_id, pooled=False
)["uri"]
pw = self.connstr.split("@")[0].split(":")[-1]
self.pgbench_env = {
"PGHOST": eps[0]["host"],


@@ -978,7 +978,10 @@ class NeonEnvBuilder:
and self.enable_scrub_on_exit
):
try:
self.env.storage_scrubber.scan_metadata()
healthy, _ = self.env.storage_scrubber.scan_metadata()
if not healthy:
e = Exception("Remote storage metadata corrupted")
cleanup_error = e
except Exception as e:
log.error(f"Error during remote storage scrub: {e}")
cleanup_error = e
@@ -4411,14 +4414,19 @@ class StorageScrubber:
assert stdout is not None
return stdout
def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]:
"""
Returns the health status and the metadata summary.
"""
args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
if post_to_storage_controller:
args.append("--post")
stdout = self.scrubber_cli(args, timeout=30)
try:
return json.loads(stdout)
summary = json.loads(stdout)
healthy = not summary["with_errors"] and not summary["with_warnings"]
return healthy, summary
except:
log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
log.error(stdout)


@@ -61,6 +61,7 @@ class HistoricLayerInfo:
remote: bool
# None for image layers, true if pageserver thinks this is an L0 delta layer
l0: Optional[bool]
visible: bool
@classmethod
def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
@@ -79,6 +80,7 @@ class HistoricLayerInfo:
lsn_end=d.get("lsn_end"),
remote=d["remote"],
l0=l0_ness,
+visible=d["access_stats"]["visible"],
)
@@ -556,6 +558,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
assert isinstance(res_json, dict)
return res_json
+def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId):
+res = self.post(
+f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc",
+)
+log.info(f"Got GC request response code: {res.status_code}")
+self.verbose_error(res)
+def timeline_unblock_gc(
+self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+):
+res = self.post(
+f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc",
+)
+log.info(f"Got GC request response code: {res.status_code}")
+self.verbose_error(res)
def timeline_compact(
self,
tenant_id: Union[TenantId, TenantShardId],

View File

@@ -389,7 +389,10 @@ WaitUntilRet = TypeVar("WaitUntilRet")
def wait_until(
-number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet]
+number_of_iterations: int,
+interval: float,
+func: Callable[[], WaitUntilRet],
+show_intermediate_error=False,
) -> WaitUntilRet:
"""
Wait until 'func' returns successfully, without exception. Returns the
@@ -402,6 +405,8 @@ def wait_until(
except Exception as e:
log.info("waiting for %s iteration %s failed", func, i + 1)
last_exception = e
+if show_intermediate_error:
+log.info(e)
time.sleep(interval)
continue
return res
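The new `show_intermediate_error` flag surfaces every failed polling attempt instead of only the final exception. A hedged usage sketch (`check` is an illustrative callable that raises until its condition holds):

```python
# Poll for up to 100 * 0.5s; with show_intermediate_error=True each failure is logged as it happens.
result = wait_until(
    100,
    0.5,
    lambda: check(),  # illustrative; any callable that raises until ready
    show_intermediate_error=True,
)
```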

View File

@@ -0,0 +1,22 @@
# Logical replication tests
## ClickHouse
```bash
export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
docker compose -f clickhouse/docker-compose.yml up -d
pytest -m remote_cluster -k test_clickhouse
docker compose -f clickhouse/docker-compose.yml down
```
## Debezium
```bash
export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
docker compose -f debezium/docker-compose.yml up -d
pytest -m remote_cluster -k test_debezium
docker compose -f debezium/docker-compose.yml down
```
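Note that the tests choose the consumer's hostname from the environment: on CI they address the compose service by name, locally they fall back to loopback. A one-line sketch of the convention used by the fixtures below:

```python
import os

# Mirrors test_clickhouse/test_debezium below; the "CI" variable is assumed to be set on CI runners.
host = "clickhouse" if "CI" in os.environ else "127.0.0.1"
```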

View File

@@ -0,0 +1,9 @@
services:
clickhouse:
image: clickhouse/clickhouse-server
user: "101:101"
container_name: clickhouse
hostname: clickhouse
ports:
- 127.0.0.1:8123:8123
- 127.0.0.1:9000:9000

View File

@@ -0,0 +1,24 @@
services:
zookeeper:
image: quay.io/debezium/zookeeper:2.7
kafka:
image: quay.io/debezium/kafka:2.7
environment:
ZOOKEEPER_CONNECT: "zookeeper:2181"
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
KAFKA_BROKER_ID: 1
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_JMX_PORT: 9991
ports:
- 127.0.0.1:9092:9092
debezium:
image: quay.io/debezium/connect:2.7
environment:
BOOTSTRAP_SERVERS: kafka:9092
GROUP_ID: 1
CONFIG_STORAGE_TOPIC: debezium-config
OFFSET_STORAGE_TOPIC: debezium-offset
STATUS_STORAGE_TOPIC: debezium-status
DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
ports:
- 127.0.0.1:8083:8083
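With this stack up, Kafka Connect's REST API listens on port 8083; the `DebeziumAPI` helper later in this diff builds all of its calls on the `/connectors` resource. A hedged smoke check, assuming the stack above is running locally:

```python
import requests

# An empty JSON list means Connect is reachable and no connectors are registered yet.
resp = requests.get("http://127.0.0.1:8083/connectors", timeout=60)
assert resp.ok and resp.json() == []
```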

View File

@@ -1,8 +1,9 @@
"""
-Test the logical replication in Neon with the different consumers
+Test the logical replication in Neon with ClickHouse as a consumer
"""
import hashlib
import os
import time
import clickhouse_connect
@@ -39,22 +40,15 @@ def test_clickhouse(remote_pg: RemotePostgres):
"""
Test the logical replication having ClickHouse as a client
"""
+clickhouse_host = "clickhouse" if ("CI" in os.environ) else "127.0.0.1"
conn_options = remote_pg.conn_options()
-for _ in range(5):
-try:
-conn = psycopg2.connect(remote_pg.connstr())
-except psycopg2.OperationalError as perr:
-log.debug(perr)
-time.sleep(1)
-else:
-break
-raise TimeoutError
+conn = psycopg2.connect(remote_pg.connstr())
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS table1")
cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
conn.commit()
-client = clickhouse_connect.get_client(host="clickhouse")
+client = clickhouse_connect.get_client(host=clickhouse_host)
client.command("SET allow_experimental_database_materialized_postgresql=1")
client.command(
"CREATE DATABASE db1_postgres ENGINE = "

View File

@@ -0,0 +1,189 @@
"""
Test the logical replication in Neon with Debezium as a consumer
"""
import json
import os
import time
import psycopg2
import pytest
import requests
from fixtures.log_helper import log
from fixtures.neon_fixtures import RemotePostgres
from fixtures.utils import wait_until
from kafka import KafkaConsumer
class DebeziumAPI:
"""
The class for Debezium API calls
"""
def __init__(self):
self.__host = "debezium" if ("CI" in os.environ) else "127.0.0.1"
self.__base_url = f"http://{self.__host}:8083"
self.__connectors_url = f"{self.__base_url}/connectors"
def __request(self, method, addurl="", **kwargs):
return requests.request(
method,
self.__connectors_url + addurl,
headers={"Accept": "application/json", "Content-type": "application/json"},
timeout=60,
**kwargs,
)
def create_pg_connector(self, remote_pg: RemotePostgres, dbz_conn_name: str):
"""
Create a Postgres connector in debezium
"""
conn_options = remote_pg.conn_options()
payload = {
"name": dbz_conn_name,
"config": {
"connector.class": "io.debezium.connector.postgresql.PostgresConnector",
"tasks.max": "1",
"database.hostname": conn_options["host"],
"database.port": "5432",
"database.user": conn_options["user"],
"database.password": conn_options["password"],
"database.dbname": conn_options["dbname"],
"plugin.name": "pgoutput",
"topic.prefix": "dbserver1",
"schema.include.list": "inventory",
},
}
return self.__request("POST", json=payload)
def list_connectors(self):
"""
Returns a list of all connectors existing in Debezium.
"""
resp = self.__request("GET")
assert resp.ok
return json.loads(resp.text)
def del_connector(self, connector):
"""
Deletes the specified connector
"""
return self.__request("DELETE", f"/{connector}")
@pytest.fixture(scope="function")
def debezium(remote_pg: RemotePostgres):
"""
Prepare the Debezium API handler and the database connection
"""
conn = psycopg2.connect(remote_pg.connstr())
cur = conn.cursor()
cur.execute("DROP SCHEMA IF EXISTS inventory CASCADE")
cur.execute("CREATE SCHEMA inventory")
cur.execute(
"CREATE TABLE inventory.customers ("
"id SERIAL NOT NULL PRIMARY KEY,"
"first_name character varying(255) NOT NULL,"
"last_name character varying(255) NOT NULL,"
"email character varying(255) NOT NULL)"
)
conn.commit()
dbz = DebeziumAPI()
assert len(dbz.list_connectors()) == 0
dbz_conn_name = "inventory-connector"
resp = dbz.create_pg_connector(remote_pg, dbz_conn_name)
log.debug("%s %s %s", resp.status_code, resp.ok, resp.text)
assert resp.status_code == 201
assert len(dbz.list_connectors()) == 1
consumer = KafkaConsumer(
"dbserver1.inventory.customers",
bootstrap_servers=["kafka:9092"],
auto_offset_reset="earliest",
enable_auto_commit=False,
)
yield conn, consumer
resp = dbz.del_connector(dbz_conn_name)
assert resp.status_code == 204
def get_kafka_msg(consumer, ts_ms, before=None, after=None) -> None:
"""
Gets the message from Kafka and checks its validity
Arguments:
consumer: the consumer object
ts_ms: timestamp in milliseconds of the database change; the corresponding message must
have a later timestamp
before: a dictionary, if not None, the before field from the kafka message must
have the same values for the same keys
after: a dictionary, if not None, the after field from the kafka message must
have the same values for the same keys
"""
msg = consumer.poll()
assert msg, "Empty message"
for val in msg.values():
r = json.loads(val[-1].value)
log.info(r["payload"])
assert ts_ms < r["payload"]["ts_ms"], "Incorrect timestamp"
for param, pname in ((before, "before"), (after, "after")):
if param is not None:
for k, v in param.items():
assert r["payload"][pname][k] == v, f"{pname} mismatches"
@pytest.mark.remote_cluster
def test_debezium(debezium):
"""
Test the logical replication having Debezium as a subscriber
"""
conn, consumer = debezium
cur = conn.cursor()
ts_ms = time.time() * 1000
log.info("Insert 1 ts_ms: %s", ts_ms)
cur.execute(
"insert into inventory.customers (first_name, last_name, email) "
"values ('John', 'Dow','johndow@example.com')"
)
conn.commit()
wait_until(
100,
0.5,
lambda: get_kafka_msg(
consumer,
ts_ms,
after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"},
),
show_intermediate_error=True,
)
ts_ms = time.time() * 1000
log.info("Insert 2 ts_ms: %s", ts_ms)
cur.execute(
"insert into inventory.customers (first_name, last_name, email) "
"values ('Alex', 'Row','alexrow@example.com')"
)
conn.commit()
wait_until(
100,
0.5,
lambda: get_kafka_msg(
consumer,
ts_ms,
after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"},
),
show_intermediate_error=True,
)
ts_ms = time.time() * 1000
log.info("Update ts_ms: %s", ts_ms)
cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2")
conn.commit()
wait_until(
100,
0.5,
lambda: get_kafka_msg(
consumer,
ts_ms,
after={"first_name": "Alexander"},
),
show_intermediate_error=True,
)
time.sleep(3)
cur.execute("select 1")

View File

@@ -1,5 +1,6 @@
from contextlib import closing
+import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.common_types import Lsn
from fixtures.compare_fixtures import NeonCompare, PgCompare
@@ -16,6 +17,7 @@ from fixtures.pg_version import PgVersion
# 3. Disk space used
# 4. Peak memory usage
#
+@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124")
def test_bulk_insert(neon_with_baseline: PgCompare):
env = neon_with_baseline

View File

@@ -100,24 +100,32 @@ def test_subscriber_lag(
pub_connstr = benchmark_project_pub.connstr
sub_connstr = benchmark_project_sub.connstr
-pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
-pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+if benchmark_project_pub.is_new:
+pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
+if benchmark_project_sub.is_new:
+pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
-if benchmark_project_pub.is_new:
-pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")
+pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
+pub_exists = len(pub_cur.fetchall()) != 0
-if benchmark_project_sub.is_new:
+if not pub_exists:
+pub_cur.execute("CREATE PUBLICATION pub1 FOR TABLE pgbench_accounts, pgbench_history")
+sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'")
+sub_exists = len(sub_cur.fetchall()) != 0
+if not sub_exists:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")
sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")
sub_cur.execute(f"CREATE SUBSCRIPTION sub1 CONNECTION '{pub_connstr}' PUBLICATION pub1")
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
@@ -195,10 +203,15 @@ def test_publisher_restart(
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
-if benchmark_project_pub.is_new:
+pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
+pub_exists = len(pub_cur.fetchall()) != 0
+if not pub_exists:
pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")
-if benchmark_project_sub.is_new:
+sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'")
+sub_exists = len(sub_cur.fetchall()) != 0
+if not sub_exists:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")

View File

@@ -217,7 +217,11 @@ def test_storage_controller_many_tenants(
# A reconciler operation: migrate a shard.
shard_number = rng.randint(0, shard_count - 1)
tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
-dest_ps_id = rng.choice([ps.id for ps in env.pageservers])
+# Migrate it to its secondary location
+desc = env.storage_controller.tenant_describe(tenant_id)
+dest_ps_id = desc["shards"][shard_number]["node_secondary"][0]
f = executor.submit(
env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id
)
@@ -231,7 +235,11 @@ def test_storage_controller_many_tenants(
for f in futs:
f.result()
-# Consistency check is safe here: all the previous operations waited for reconcile before completing
+# Some of the operations above (notably migrations) might leave the controller in a state where it has
+# some work to do, for example optimizing shard placement after we do a random migration. Wait for the
+# system to reach a quiescent state before doing the following checks.
+env.storage_controller.reconcile_until_idle()
env.storage_controller.consistency_check()
check_memory()

View File

@@ -496,11 +496,10 @@ def test_historic_storage_formats(
# Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt)
#
# Do this _before_ importing to the pageserver, as that import may start writing immediately
-metadata_summary = env.storage_scrubber.scan_metadata()
+healthy, metadata_summary = env.storage_scrubber.scan_metadata()
+assert healthy
assert metadata_summary["tenant_count"] >= 1
assert metadata_summary["timeline_count"] >= 1
assert not metadata_summary["with_errors"]
assert not metadata_summary["with_warnings"]
env.neon_cli.import_tenant(dataset.tenant_id)

View File

@@ -214,12 +214,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
# Having written a mixture of generation-aware and legacy index_part.json,
# ensure the scrubber handles the situation as expected.
-metadata_summary = env.storage_scrubber.scan_metadata()
+healthy, metadata_summary = env.storage_scrubber.scan_metadata()
assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline
assert metadata_summary["timeline_count"] == 1
assert metadata_summary["timeline_shard_count"] == 1
-assert not metadata_summary["with_errors"]
-assert not metadata_summary["with_warnings"]
+assert healthy
def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):

View File

@@ -2,10 +2,11 @@ import json
import os
import random
import time
-from typing import Any, Dict, Optional
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
import pytest
-from fixtures.common_types import TenantId, TimelineId
+from fixtures.common_types import TenantId, TenantShardId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
from fixtures.pageserver.common_types import parse_layer_file_name
@@ -437,6 +438,35 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
validate_heatmap(heatmap_second)
+def list_elegible_layers(
+pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+) -> list[Path]:
+"""
+The subset of layer filenames that are eligible for secondary download: at the time of writing this
+is all resident layers which are also visible.
+"""
+candidates = pageserver.list_layers(tenant_id, timeline_id)
+layer_map = pageserver.http_client().layer_map_info(tenant_id, timeline_id)
+# Map of layer filenames to their visibility; the "layer name" is not the same as the filename: add a suffix to resolve one to the other
+visible_map = dict(
+(f"{layer.layer_file_name}-v1-00000001", layer.visible)
+for layer in layer_map.historic_layers
+)
+def is_visible(layer_file_name):
+try:
+return visible_map[str(layer_file_name)]
+except KeyError:
+# Unexpected: tests should call this when pageservers are in a quiet state, such that the layer map
+# matches what's on disk.
+log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
+raise
+return list(c for c in candidates if is_visible(c))
def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
"""
Test the overall data flow in secondary mode:
@@ -491,7 +521,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
ps_secondary.http_client().tenant_secondary_download(tenant_id)
-assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
+assert list_elegible_layers(ps_attached, tenant_id, timeline_id) == ps_secondary.list_layers(
tenant_id, timeline_id
)
@@ -509,9 +539,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
ps_secondary.http_client().tenant_secondary_download(tenant_id)
try:
-assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
-tenant_id, timeline_id
-)
+assert list_elegible_layers(
+ps_attached, tenant_id, timeline_id
+) == ps_secondary.list_layers(tenant_id, timeline_id)
except:
# Do a full listing of the secondary location on errors, to help debug
# https://github.com/neondatabase/neon/issues/6966
@@ -532,8 +562,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
# ==================================================================
try:
log.info("Evicting a layer...")
-layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0]
-some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1]
+layer_to_evict = list_elegible_layers(ps_attached, tenant_id, timeline_id)[0]
+some_other_layer = list_elegible_layers(ps_attached, tenant_id, timeline_id)[1]
log.info(f"Victim layer: {layer_to_evict.name}")
ps_attached.http_client().evict_layer(
tenant_id, timeline_id, layer_name=layer_to_evict.name
@@ -551,9 +581,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
ps_secondary.http_client().tenant_secondary_download(tenant_id)
assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id)
-assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
-tenant_id, timeline_id
-)
+assert list_elegible_layers(
+ps_attached, tenant_id, timeline_id
+) == ps_secondary.list_layers(tenant_id, timeline_id)
except:
# On assertion failures, log some details to help with debugging
heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
@@ -563,7 +593,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
# Scrub the remote storage
# ========================
# This confirms that the scrubber isn't upset by the presence of the heatmap
-env.storage_scrubber.scan_metadata()
+healthy, _ = env.storage_scrubber.scan_metadata()
+assert healthy
# Detach secondary and delete tenant
# ===================================

View File

@@ -124,7 +124,8 @@ def test_sharding_smoke(
# Check the scrubber isn't confused by sharded content, then disable
# it during teardown because the tenant will have been deleted by then
-env.storage_scrubber.scan_metadata()
+healthy, _ = env.storage_scrubber.scan_metadata()
+assert healthy
env.storage_controller.pageserver_api().tenant_delete(tenant_id)
assert_prefix_empty(

View File

@@ -516,9 +516,8 @@ def test_scrubber_scan_pageserver_metadata(
assert len(index.layer_metadata) > 0
it = iter(index.layer_metadata.items())
-scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
-assert not scan_summary["with_warnings"]
-assert not scan_summary["with_errors"]
+healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+assert healthy
assert env.storage_controller.metadata_health_is_healthy()
@@ -532,16 +531,18 @@ def test_scrubber_scan_pageserver_metadata(
log.info(f"delete response: {delete_response}")
# Check scan summary without posting to storage controller. Expect it to be an L0 layer, so only warnings are emitted.
-scan_summary = env.storage_scrubber.scan_metadata()
+_, scan_summary = env.storage_scrubber.scan_metadata()
log.info(f"{pprint.pformat(scan_summary)}")
assert len(scan_summary["with_warnings"]) > 0
assert env.storage_controller.metadata_health_is_healthy()
# Now post to storage controller, expect seeing one unhealthy health record
-scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+_, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
log.info(f"{pprint.pformat(scan_summary)}")
assert len(scan_summary["with_warnings"]) > 0
unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
neon_env_builder.disable_scrub_on_exit()

View File

@@ -341,13 +341,13 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
env.stop()
-result = env.storage_scrubber.scan_metadata()
-assert result["with_warnings"] == []
+healthy, _ = env.storage_scrubber.scan_metadata()
+assert healthy
env.start()
ps_http = env.pageserver.http_client()
ps_http.tenant_delete(tenant_id)
env.stop()
-env.storage_scrubber.scan_metadata()
-assert result["with_warnings"] == []
+healthy, _ = env.storage_scrubber.scan_metadata()
+assert healthy

View File

@@ -0,0 +1,67 @@
import time
from fixtures.neon_fixtures import (
NeonEnvBuilder,
)
from fixtures.pageserver.utils import wait_timeline_detail_404
def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start(
initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}
)
ps = env.pageserver
http = ps.http_client()
foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant)
gc_active_line = ".* gc_loop.*: [12] timelines need GC"
gc_skipped_line = ".* gc_loop.*: Skipping GC: .*"
init_gc_skipped = ".*: initialized with gc blocked.*"
tenant_before = http.tenant_status(env.initial_tenant)
wait_for_another_gc_round()
_, offset = ps.assert_log_contains(gc_active_line)
assert ps.log_contains(gc_skipped_line, offset) is None
http.timeline_block_gc(env.initial_tenant, foo_branch)
tenant_after = http.tenant_status(env.initial_tenant)
assert tenant_before != tenant_after
gc_blocking = tenant_after["gc_blocking"]
assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"
wait_for_another_gc_round()
_, offset = ps.assert_log_contains(gc_skipped_line, offset)
ps.restart()
ps.quiesce_tenants()
_, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset)
wait_for_another_gc_round()
_, offset = ps.assert_log_contains(gc_skipped_line, offset)
# deletion unblocks gc
http.timeline_delete(env.initial_tenant, foo_branch)
wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0)
wait_for_another_gc_round()
_, offset = ps.assert_log_contains(gc_active_line, offset)
http.timeline_block_gc(env.initial_tenant, env.initial_timeline)
wait_for_another_gc_round()
_, offset = ps.assert_log_contains(gc_skipped_line, offset)
# removing the manual block also unblocks gc
http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline)
wait_for_another_gc_round()
_, offset = ps.assert_log_contains(gc_active_line, offset)
def wait_for_another_gc_round():
# gc_period is 1s in this test's tenant config; sleeping 2s guarantees at
# least one more GC iteration has run before the log checks above.
time.sleep(2)

View File

@@ -936,6 +936,9 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
+# just make sure this doesn't hit an assertion
+client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True)
# load in some data
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
endpoint.safe_psql_many(