Compare commits


3 Commits

Author      SHA1        Message                                             Date
John Spray  417b1319cd  DNM debug                                           2024-08-02 13:29:24 +01:00
John Spray  c2aadb0c7f  wip2                                                2024-08-02 10:53:36 +01:00
John Spray  00648a9c49  pageserver: start refactoring ingest metadata ops   2024-08-02 10:02:26 +01:00
136 changed files with 2363 additions and 4269 deletions

View File

@@ -8,8 +8,6 @@ self-hosted-runner:
- small-arm64
- us-east-2
config-variables:
- BENCHMARK_PROJECT_ID_PUB
- BENCHMARK_PROJECT_ID_SUB
- REMOTE_STORAGE_AZURE_CONTAINER
- REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID

View File

@@ -147,7 +147,7 @@ jobs:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -168,7 +168,7 @@ jobs:
path: /tmp/neon/
prefix: latest
- name: Run Logical Replication benchmarks
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
@@ -176,15 +176,12 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}
- name: Run Physical Replication benchmarks
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}

View File

@@ -66,31 +66,7 @@ jobs:
ports:
- 9000:9000
- 8123:8123
zookeeper:
image: quay.io/debezium/zookeeper:2.7
ports:
- 2181:2181
kafka:
image: quay.io/debezium/kafka:2.7
env:
ZOOKEEPER_CONNECT: "zookeeper:2181"
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
KAFKA_BROKER_ID: 1
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_JMX_PORT: 9991
ports:
- 9092:9092
debezium:
image: quay.io/debezium/connect:2.7
env:
BOOTSTRAP_SERVERS: kafka:9092
GROUP_ID: 1
CONFIG_STORAGE_TOPIC: debezium-config
OFFSET_STORAGE_TOPIC: debezium-offset
STATUS_STORAGE_TOPIC: debezium-status
DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
ports:
- 8083:8083
steps:
- uses: actions/checkout@v4

View File

@@ -66,22 +66,8 @@ jobs:
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Azure login
if: steps.check-manifests.outputs.skip == 'false'
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
- name: Login to ACR
if: steps.check-manifests.outputs.skip == 'false'
run: |
az acr login --name=neoneastus2
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
-t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}

View File

@@ -13,6 +13,8 @@ defaults:
env:
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
jobs:
cancel-previous-e2e-tests:
@@ -62,35 +64,19 @@ jobs:
needs: [ tag ]
runs-on: ubuntu-22.04
env:
EVENT_ACTION: ${{ github.event.action }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
TAG: ${{ needs.tag.outputs.build-tag }}
steps:
- name: Wait for `promote-images` job to finish
# It's important to have a timeout here, the script in the step can run infinitely
timeout-minutes: 60
- name: check if ecr image are present
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: |
if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then
exit 0
fi
# For PRs we use the run id as the tag
BUILD_AND_TEST_RUN_ID=${TAG}
while true; do
conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
case "$conclusion" in
success)
break
;;
failure | cancelled | skipped)
echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
exit 1
;;
*)
echo "The 'promote-images' hasn't succeed yet. Waiting..."
sleep 60
;;
esac
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
if [ "$OUTPUT" == "" ]; then
echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
exit 1
fi
done
- name: Set e2e-platforms

Cargo.lock (generated, 6 changed lines)
View File

@@ -4324,7 +4324,6 @@ dependencies = [
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"try-lock",
"typed-json",
"url",
"urlencoding",
@@ -5704,7 +5703,6 @@ dependencies = [
"pageserver_client",
"postgres_connection",
"r2d2",
"rand 0.8.5",
"reqwest 0.12.4",
"routerify",
"scopeguard",
@@ -6564,9 +6562,9 @@ dependencies = [
[[package]]
name = "try-lock"
version = "0.2.5"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
[[package]]
name = "tungstenite"

View File

@@ -184,7 +184,6 @@ tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1"
url = "2.2"

View File

@@ -33,8 +33,8 @@ ARG BUILD_TAG
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build
ARG RUSTC_WRAPPER=cachepot
ENV AWS_REGION eu-central-1
ENV CACHEPOT_S3_KEY_PREFIX cachepot
ENV AWS_REGION=eu-central-1
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY

View File

@@ -66,13 +66,13 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
&& rm -rf protoc.zip protoc
# s5cmd
ENV S5CMD_VERSION 2.2.2
ENV S5CMD_VERSION=2.2.2
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
&& chmod +x s5cmd \
&& mv s5cmd /usr/local/bin/s5cmd
# LLVM
ENV LLVM_VERSION 18
ENV LLVM_VERSION=18
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& apt update \
@@ -125,8 +125,8 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
&& rm -rf ../lcov.tar.gz
# Compile and install the static OpenSSL library
ENV OPENSSL_VERSION 1.1.1w
ENV OPENSSL_PREFIX /usr/local/openssl
ENV OPENSSL_VERSION=1.1.1w
ENV OPENSSL_PREFIX=/usr/local/openssl
RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
cd /tmp && \
@@ -145,8 +145,8 @@ RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/sourc
# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu
# package, which is 67.1. We're duplicating that knowledge here, and also, technically,
# Debian has a few patches on top of 67.1 that we're not adding here.
ENV ICU_VERSION 67.1
ENV ICU_PREFIX /usr/local/icu
ENV ICU_VERSION=67.1
ENV ICU_PREFIX=/usr/local/icu
# Download and build static ICU
RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \
@@ -168,9 +168,9 @@ USER nonroot:nonroot
WORKDIR /home/nonroot
# Python
ENV PYTHON_VERSION 3.9.18
ENV PYENV_ROOT /home/nonroot/.pyenv
ENV PATH /home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
ENV PYTHON_VERSION=3.9.18 \
PYENV_ROOT=/home/nonroot/.pyenv \
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
RUN set -e \
&& cd $HOME \
&& curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
@@ -192,9 +192,9 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION 1.80.0
ENV RUSTUP_HOME "/home/nonroot/.rustup"
ENV PATH "/home/nonroot/.cargo/bin:${PATH}"
ENV RUSTC_VERSION=1.80.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
chmod +x rustup-init && \
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -211,7 +211,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
cargo install cargo-nextest && \
rm -rf /home/nonroot/.cargo/registry && \
rm -rf /home/nonroot/.cargo/git
ENV RUSTC_WRAPPER cachepot
ENV RUSTC_WRAPPER=cachepot
# Show versions
RUN whoami \

View File

@@ -647,8 +647,8 @@ RUN apt-get update && \
apt-get install -y curl libclang-dev cmake && \
useradd -ms /bin/bash nonroot -b /home
ENV HOME /home/nonroot
ENV PATH "/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
USER nonroot
WORKDIR /home/nonroot
ARG PG_VERSION
@@ -873,7 +873,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
#########################################################################################
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
ARG BUILD_TAG
ENV BUILD_TAG $BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
@@ -933,8 +933,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY patches/rum.patch /ext-src
#COPY --from=rum-pg-build /rum.tar.gz /ext-src
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -946,7 +945,7 @@ COPY patches/pg_hintplan.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
@@ -961,18 +960,17 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|| exit 1; rm -f $f; done
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
# cmake is required for the h3 test
RUN apt-get update && apt-get install -y cmake
RUN patch -p1 < /ext-src/pg_hintplan.patch
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN patch -p1 </ext-src/pg_anon.patch
RUN patch -p1 </ext-src/pg_cron.patch
ENV PATH /usr/local/pgsql/bin:$PATH
ENV PGHOST compute
ENV PGPORT 55433
ENV PGUSER cloud_admin
ENV PGDATABASE postgres
ENV PATH=/usr/local/pgsql/bin:$PATH
ENV PGHOST=compute
ENV PGPORT=55433
ENV PGUSER=cloud_admin
ENV PGDATABASE=postgres
#########################################################################################
#
# Final layer

View File

@@ -4,11 +4,6 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
# Enables test specific features.
testing = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true

View File

@@ -400,15 +400,7 @@ impl ComputeNode {
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let mut retry_period_ms = 500.0;
let mut attempts = 0;
const DEFAULT_ATTEMPTS: u16 = 10;
#[cfg(feature = "testing")]
let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
u16::from_str(&v).unwrap()
} else {
DEFAULT_ATTEMPTS
};
#[cfg(not(feature = "testing"))]
let max_attempts = DEFAULT_ATTEMPTS;
let max_attempts = 10;
loop {
let result = self.try_get_basebackup(compute_state, lsn);
match result {

View File

@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
for (var, val) in std::env::vars() {
if var.starts_with("NEON_") {
if var.starts_with("NEON_PAGESERVER_") {
cmd = cmd.env(var, val);
}
}

View File

@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
rm -rf $TMPDIR
# We are running tests now
if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
then
cleanup

View File

@@ -1,15 +1,15 @@
#!/bin/bash
set -x
cd /ext-src || exit 2
cd /ext-src
FAILED=
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
for d in ${LIST}
do
[ -d "${d}" ] || continue
[ -d ${d} ] || continue
psql -c "select 1" >/dev/null || break
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
make -C ${d} installcheck || FAILED="${d} ${FAILED}"
done
[ -z "${FAILED}" ] && exit 0
echo "${FAILED}"
echo ${FAILED}
exit 1

View File

@@ -107,10 +107,7 @@ impl Key {
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
"invalid key: {self}",
);
assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0x7F) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)

View File

@@ -637,13 +637,6 @@ pub struct TenantInfo {
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
pub attachment_status: TenantAttachmentStatus,
pub generation: u32,
/// Opaque explanation if gc is being blocked.
///
/// Only looked up for the individual tenant detail, not the listing. This is purely for
/// debugging, not included in openapi.
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_blocking: Option<String>,
}
#[derive(Serialize, Deserialize, Clone)]
@@ -947,8 +940,6 @@ pub struct TopTenantShardsResponse {
}
pub mod virtual_file {
use std::path::PathBuf;
#[derive(
Copy,
Clone,
@@ -967,53 +958,6 @@ pub mod virtual_file {
#[cfg(target_os = "linux")]
TokioEpollUring,
}
/// Direct IO modes for a pageserver.
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum DirectIoMode {
/// Direct IO disabled (uses usual buffered IO).
#[default]
Disabled,
/// Direct IO disabled (performs checks and perf simulations).
Evaluate {
/// Alignment check level
alignment_check: DirectIoAlignmentCheckLevel,
/// Latency padded for performance simulation.
latency_padding: DirectIoLatencyPadding,
},
/// Direct IO enabled.
Enabled {
/// Actions to perform on alignment error.
on_alignment_error: DirectIoOnAlignmentErrorAction,
},
}
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum DirectIoAlignmentCheckLevel {
#[default]
Error,
Log,
None,
}
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum DirectIoOnAlignmentErrorAction {
Error,
#[default]
FallbackToBuffered,
}
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(tag = "type", rename_all = "kebab-case")]
pub enum DirectIoLatencyPadding {
/// Pad virtual file operations with IO to a fake file.
FakeFileRW { path: PathBuf },
#[default]
None,
}
}
// Wrapped in libpq CopyData
@@ -1483,7 +1427,6 @@ mod tests {
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: 1,
gc_blocking: None,
};
let expected_active = json!({
"id": original_active.id.to_string(),
@@ -1506,7 +1449,6 @@ mod tests {
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: 1,
gc_blocking: None,
};
let expected_broken = json!({
"id": original_broken.id.to_string(),

View File

@@ -1,8 +1,6 @@
use std::collections::HashSet;
use utils::id::TimelineId;
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct AncestorDetached {
pub reparented_timelines: HashSet<TimelineId>,
pub reparented_timelines: Vec<TimelineId>,
}

View File

@@ -78,9 +78,8 @@ impl Drop for GateGuard {
}
}
#[derive(Debug, thiserror::Error)]
#[derive(Debug)]
pub enum GateError {
#[error("gate is closed")]
GateClosed,
}

View File

@@ -108,7 +108,3 @@ harness = false
[[bench]]
name = "bench_walredo"
harness = false
[[bench]]
name = "bench_ingest"
harness = false

View File

@@ -1,239 +0,0 @@
use std::{env, num::NonZeroUsize};
use bytes::Bytes;
use camino::Utf8PathBuf;
use criterion::{criterion_group, criterion_main, Criterion};
use pageserver::{
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
l0_flush::{L0FlushConfig, L0FlushGlobalState},
page_cache,
repository::Value,
task_mgr::TaskKind,
tenant::storage_layer::InMemoryLayer,
virtual_file,
};
use pageserver_api::{key::Key, shard::TenantShardId};
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
};
// A very cheap hash for generating non-sequential keys.
fn murmurhash32(mut h: u32) -> u32 {
h ^= h >> 16;
h = h.wrapping_mul(0x85ebca6b);
h ^= h >> 13;
h = h.wrapping_mul(0xc2b2ae35);
h ^= h >> 16;
h
}
enum KeyLayout {
/// Sequential unique keys
Sequential,
/// Random unique keys
Random,
/// Random keys, but only use the bits from the mask of them
RandomReuse(u32),
}
enum WriteDelta {
Yes,
No,
}
async fn ingest(
conf: &'static PageServerConf,
put_size: usize,
put_count: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
) -> anyhow::Result<()> {
let mut lsn = utils::lsn::Lsn(1000);
let mut key = Key::from_i128(0x0);
let timeline_id = TimelineId::generate();
let tenant_id = TenantId::generate();
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
let gate = utils::sync::gate::Gate::default();
let entered = gate.enter().unwrap();
let layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
let ctx = RequestContext::new(
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
pageserver::context::DownloadBehavior::Download,
);
for i in 0..put_count {
lsn += put_size as u64;
// Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
// usually care the most about write performance when they're blasting a huge batch of data into a huge table.
match key_layout {
KeyLayout::Sequential => {
// Use sequential order to illustrate the experience a user is likely to have
// when ingesting bulk data.
key.field6 = i as u32;
}
KeyLayout::Random => {
// Use random-order keys to avoid giving a false advantage to data structures that are
// faster when inserting on the end.
key.field6 = murmurhash32(i as u32);
}
KeyLayout::RandomReuse(mask) => {
// Use low bits only, to limit cardinality
key.field6 = murmurhash32(i as u32) & mask;
}
}
layer.put_value(key, lsn, &data, &ctx).await?;
}
layer.freeze(lsn + 1).await;
if matches!(write_delta, WriteDelta::Yes) {
let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
max_concurrency: NonZeroUsize::new(1).unwrap(),
});
let (_desc, path) = layer
.write_to_disk(&ctx, None, l0_flush_state.inner())
.await?
.unwrap();
tokio::fs::remove_file(path).await?;
}
Ok(())
}
/// Wrapper to instantiate a tokio runtime
fn ingest_main(
conf: &'static PageServerConf,
put_size: usize,
put_count: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
) {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
runtime.block_on(async move {
let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
if let Err(e) = r {
panic!("{e:?}");
}
});
}
/// Declare a series of benchmarks for the Pageserver's ingest write path.
///
/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either
/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
///
/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on
/// a fast disk, CPU is the bottleneck at time of writing.
fn criterion_benchmark(c: &mut Criterion) {
let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap();
eprintln!("Data directory: {}", temp_dir.path());
let conf: &'static PageServerConf = Box::leak(Box::new(
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
));
virtual_file::init(16384, virtual_file::io_engine_for_bench());
page_cache::init(conf.page_cache_size);
{
let mut group = c.benchmark_group("ingest-small-values");
let put_size = 100usize;
let put_count = 128 * 1024 * 1024 / put_size;
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
group.sample_size(10);
group.bench_function("ingest 128MB/100b seq", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b rand", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Random,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::RandomReuse(0x3ff),
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b seq, no delta", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::No,
)
})
});
}
{
let mut group = c.benchmark_group("ingest-big-values");
let put_size = 8192usize;
let put_count = 128 * 1024 * 1024 / put_size;
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
group.sample_size(10);
group.bench_function("ingest 128MB/8k seq", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/8k seq, no delta", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::No,
)
})
});
}
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@@ -123,7 +123,6 @@ fn main() -> anyhow::Result<()> {
// after setting up logging, log the effective IO engine choice and read path implementations
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
info!(?conf.get_impl, "starting with get page implementation");
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

View File

@@ -300,9 +300,6 @@ pub struct PageServerConf {
/// This flag is temporary and will be removed after gradual rollout.
/// See <https://github.com/neondatabase/neon/issues/8184>.
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
/// Direct IO settings
pub virtual_file_direct_io: virtual_file::DirectIoMode,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -411,8 +408,6 @@ struct PageServerConfigBuilder {
l0_flush: BuilderValue<L0FlushConfig>,
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
}
impl PageServerConfigBuilder {
@@ -503,7 +498,6 @@ impl PageServerConfigBuilder {
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()),
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
}
}
}
@@ -691,10 +685,6 @@ impl PageServerConfigBuilder {
self.compact_level0_phase1_value_access = BuilderValue::Set(value);
}
pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) {
self.virtual_file_direct_io = BuilderValue::Set(value);
}
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
@@ -753,7 +743,6 @@ impl PageServerConfigBuilder {
ephemeral_bytes_per_memory_kb,
l0_flush,
compact_level0_phase1_value_access,
virtual_file_direct_io,
}
CUSTOM LOGIC
{
@@ -1029,9 +1018,6 @@ impl PageServerConf {
"compact_level0_phase1_value_access" => {
builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
}
"virtual_file_direct_io" => {
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1117,7 +1103,6 @@ impl PageServerConf {
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
}
}
}
@@ -1360,7 +1345,6 @@ background_task_maximum_delay = '334 s'
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -1436,7 +1420,6 @@ background_task_maximum_delay = '334 s'
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -308,45 +308,6 @@ paths:
application/json:
schema:
type: string
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Persistently add a gc blocking at the tenant level because of this timeline
responses:
"200":
description: OK
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Persistently remove a tenant level gc blocking for this timeline
responses:
"200":
description: OK
/v1/tenant/{tenant_shard_id}/location_config:
parameters:
- name: tenant_shard_id
@@ -932,7 +893,7 @@ components:
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
ArchivalConfigRequest:
type: object
required:
required
- state
properties:
state:

View File

@@ -935,7 +935,6 @@ async fn tenant_list_handler(
generation: (*gen)
.into()
.expect("Tenants are always attached with a generation"),
gc_blocking: None,
})
.collect::<Vec<TenantInfo>>();
@@ -987,7 +986,6 @@ async fn tenant_status(
.generation()
.into()
.expect("Tenants are always attached with a generation"),
gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
},
walredo: tenant.wal_redo_manager_status(),
timelines: tenant.list_timeline_ids(),
@@ -1162,10 +1160,7 @@ async fn layer_map_info_handler(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let layer_map_info = timeline
.layer_map_info(reset)
.await
.map_err(|_shutdown| ApiError::ShuttingDown)?;
let layer_map_info = timeline.layer_map_info(reset).await;
json_response(StatusCode::OK, layer_map_info)
}
@@ -1231,72 +1226,6 @@ async fn evict_timeline_layer_handler(
}
}
async fn timeline_gc_blocking_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
block_or_unblock_gc(request, true).await
}
async fn timeline_gc_unblocking_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
block_or_unblock_gc(request, false).await
}
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
///
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
async fn block_or_unblock_gc(
request: Request<Body>,
block: bool,
) -> Result<Response<Body>, ApiError> {
use crate::tenant::{
remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
};
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant.get_timeline(timeline_id, true)?;
let fut = async {
if block {
timeline.block_gc(&tenant).await.map(|_| ())
} else {
timeline.unblock_gc(&tenant).await
}
};
let span = tracing::info_span!(
"block_or_unblock_gc",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
timeline_id = %timeline_id,
block = block,
);
let res = fut.instrument(span).await;
res.map_err(|e| {
if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
ApiError::ShuttingDown
} else {
ApiError::InternalServerError(e)
}
})?;
json_response(StatusCode::OK, ())
}
/// Get tenant_size SVG graph along with the JSON data.
fn synthetic_size_html_response(
inputs: ModelInputs,
@@ -2975,14 +2904,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, evict_timeline_layer_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
|r| api_handler(r, timeline_gc_blocking_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|r| api_handler(r, timeline_gc_unblocking_handler),
)
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
api_handler(r, secondary_upload_handler)
})

View File

@@ -155,17 +155,9 @@ async fn import_rel(
//
// FIXME: Keep track of which relations we've already created?
// https://github.com/neondatabase/neon/issues/3309
if let Err(e) = modification
modification
.put_rel_creation(rel, nblocks as u32, ctx)
.await
{
match e {
RelationError::AlreadyExists => {
debug!("Relation {} already exist. We must be extending it.", rel)
}
_ => return Err(e.into()),
}
}
.await?;
loop {
let r = reader.read_exact(&mut buf).await;

View File

@@ -24,7 +24,7 @@ impl Default for L0FlushConfig {
#[derive(Clone)]
pub struct L0FlushGlobalState(Arc<Inner>);
pub enum Inner {
pub(crate) enum Inner {
PageCached,
Direct { semaphore: tokio::sync::Semaphore },
}
@@ -40,7 +40,7 @@ impl L0FlushGlobalState {
}
}
pub fn inner(&self) -> &Arc<Inner> {
pub(crate) fn inner(&self) -> &Arc<Inner> {
&self.0
}
}

View File

@@ -122,19 +122,16 @@ impl Listener {
}
}
impl Connections {
pub(crate) async fn shutdown(self) {
pub async fn shutdown(self) {
let Self { cancel, mut tasks } = self;
cancel.cancel();
while let Some(res) = tasks.join_next().await {
Self::handle_connection_completion(res);
}
}
fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
match res {
Ok(Ok(())) => {}
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
Err(e) => error!("page_service connection task panicked: {:?}", e),
// the logging done here mimics what was formerly done by task_mgr
match res {
Ok(Ok(())) => {}
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
Err(e) => error!("page_service connection task panicked: {:?}", e),
}
}
}
}
@@ -158,19 +155,20 @@ pub async fn libpq_listener_main(
let connections_cancel = CancellationToken::new();
let mut connection_handler_tasks = tokio::task::JoinSet::default();
loop {
let accepted = tokio::select! {
biased;
_ = listener_cancel.cancelled() => break,
next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => {
let res = next.expect("we dont poll while empty");
Connections::handle_connection_completion(res);
continue;
}
accepted = listener.accept() => accepted,
};
// Wait for a new connection to arrive, or for server shutdown.
while let Some(res) = tokio::select! {
biased;
match accepted {
_ = listener_cancel.cancelled() => {
// We were requested to shut down.
None
}
res = listener.accept() => {
Some(res)
}
} {
match res {
Ok((socket, peer_addr)) => {
// Connection established. Spawn a new task to handle it.
debug!("accepted connection from {}", peer_addr);

View File

@@ -174,6 +174,7 @@ impl Timeline {
pending_deletions: Vec::new(),
pending_nblocks: 0,
pending_directory_entries: Vec::new(),
metadata_state: MetadataWriteState::new(self),
lsn,
}
}
@@ -1034,6 +1035,229 @@ impl Timeline {
}
}
/// Write something other than a simple postgres key/value: unlike regular relation page writes, these
/// require access to a Timeline in order to do read-modify-write.
#[derive(Debug)]
enum MetadataOp {
// - Insert to DBDIR_KEY if this (spcnode, dbnode) does not already exist
// - Insert to rel_dir_to_key(spcnode, dbnode)
UpsertRelDirectory { spcnode: Oid, dbnode: Oid },
UpsertRelDirectory2 { rel: RelTag, nblocks: BlockNumber },
// - Insert this xid to TWOPHASEDIR_KEY
// UpdateTwoPhaseDir{
// xid: TransactionId
// },
// // - Drop this (spcnode, dbnode) from DBDIR_KEY
// DropDbDir {
// spcnode: Oid,
// dbnode: Oid
// },
// // - Drop this (spcnode, dbnode) from DBDIR_KEY
// DropRel {
// rel: RelTag,
// },
// // - Read-subtract-write the relation size for this rel
// RelTruncate {
// rel: RelTag,
// nblocks: BlockNumber
// },
// // - Read-add-write the relation size for this rel
// RelExtend {
// rel: RelTag,
// nblocks: BlockNumber
// },
// // - Read-modify-write of `slru_dir_to_key` and `slru_segment_size_to_key`
// CreateSlruSegment {
// kind: SlruKind,
// segno: u32,
// nblocks: BlockNumber,
// }
}
/// State that spans all the apply() calls of all the MetadataOps in a DatadirModification
struct MetadataWriteState<'a> {
/// The timeline this modification applies to. You can access this to
/// read the state, but note that any pending updates are *not* reflected
/// in the state in 'tline' yet.
pub tline: &'a Timeline,
// Write-through cache.
// For pages that we read-modify-write, stash the last value here after each MetadataOp,
// so that we don't have to enter Timeline::get more than necessary.
last_write: HashMap<Key, Bytes>,
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, usize)>,
// Debug assertions: for calls that we expect to always come in LSN order, track the last LSN we saw
#[cfg(debug_assertions)]
debug_last_lsn: Lsn,
}
impl<'a> MetadataWriteState<'a> {
fn new(timeline: &'a Timeline) -> Self {
if cfg!(debug_assertions) {
Self {
tline: timeline,
last_write: HashMap::default(),
pending_directory_entries: Vec::default(),
debug_last_lsn: Lsn(0),
}
} else {
Self {
tline: timeline,
last_write: HashMap::default(),
pending_directory_entries: Vec::default(),
debug_last_lsn: Lsn(0),
}
}
}
fn assert_lsn_order(&mut self, lsn: Lsn) {
#[cfg(debug_assertions)]
{
debug_assert!(lsn >= self.debug_last_lsn);
self.debug_last_lsn = lsn;
}
}
async fn get(
&mut self,
lsn: Lsn,
key: Key,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
self.assert_lsn_order(lsn);
match self.last_write.get(&key) {
Some(v) => Ok(v.clone()),
None => self.tline.get(key, lsn, ctx).await,
}
}
/// Observe a page write
fn put(&mut self, lsn: Lsn, key: Key, value: Bytes) {
self.assert_lsn_order(lsn);
self.last_write.insert(key, value);
}
}
impl MetadataOp {
async fn apply<'a>(
self,
lsn: Lsn,
data_dir_mod: &mut DatadirModification<'a>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
eprintln!("MetadataOp::apply: {self:?}");
match self {
Self::UpsertRelDirectory { spcnode, dbnode } => {
// Add it to the directory (if it doesn't exist already)
let buf = data_dir_mod.metadata_state.get(lsn, DBDIR_KEY, ctx).await?;
let mut dbdir = DbDirectory::des(&buf)?;
let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
if r.is_none() || r == Some(false) {
// The dbdir entry didn't exist, or it contained a
// 'false'. The 'insert' call already updated it with
// 'true', now write the updated 'dbdirs' map back.
let buf = DbDirectory::ser(&dbdir)?;
data_dir_mod.put_metadata_page(lsn, DBDIR_KEY, Bytes::from(buf));
}
if r.is_none() {
// Create RelDirectory
let buf = RelDirectory::ser(&RelDirectory {
rels: HashSet::new(),
})?;
data_dir_mod
.metadata_state
.pending_directory_entries
.push((DirectoryKind::Rel, 0));
data_dir_mod.put_metadata_page(
lsn,
rel_dir_to_key(spcnode, dbnode),
Bytes::from(buf),
);
}
Ok(())
}
Self::UpsertRelDirectory2 { rel, nblocks } => {
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(
&data_dir_mod
.metadata_state
.get(lsn, DBDIR_KEY, ctx)
.await
.context("read db")?,
)
.context("deserialize db")?;
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir = if let hash_map::Entry::Vacant(e) =
dbdir.dbdirs.entry((rel.spcnode, rel.dbnode))
{
// Didn't exist. Update dbdir
e.insert(false);
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
data_dir_mod
.metadata_state
.pending_directory_entries
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
data_dir_mod.put_metadata_page(lsn, DBDIR_KEY, buf.into());
// and create the RelDirectory
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(
&data_dir_mod
.metadata_state
.get(lsn, rel_dir_key, ctx)
.await
.context("read db")?,
)
.context("deserialize db")?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
// Drop out early if the relation already existed
return Ok(());
}
data_dir_mod
.metadata_state
.pending_directory_entries
.push((DirectoryKind::Rel, rel_dir.rels.len()));
data_dir_mod.put_metadata_page(
lsn,
rel_dir_key,
Bytes::from(RelDirectory::ser(&rel_dir).context("serialize")?),
);
// Put size
let size_key = rel_size_to_key(rel);
let buf = nblocks.to_le_bytes();
data_dir_mod.put_metadata_page(lsn, size_key, Bytes::from(buf.to_vec()));
data_dir_mod.pending_nblocks += nblocks as i64;
// Update relation size cache
data_dir_mod
.metadata_state
.tline
.set_cached_rel_size(rel, lsn, nblocks);
Ok(())
}
}
}
}
/// DatadirModification represents an operation to ingest an atomic set of
/// updates to the repository. It is created by the 'begin_record'
/// function. It is called for each WAL record, so that all the modifications
@@ -1055,6 +1279,8 @@ pub struct DatadirModification<'a> {
pending_deletions: Vec<(Range<Key>, Lsn)>,
pending_nblocks: i64,
metadata_state: MetadataWriteState<'a>,
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, usize)>,
@@ -1081,6 +1307,26 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
/// Apply a complex write op that may require read-modify-write to the underlying Timeline.
async fn put_metadata_op(
&mut self,
lsn: Lsn,
meta: MetadataOp,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Placeholder: just run inline.
// TODO: make this sync, and defer all these to commit(), so that we don't have to carry a Timeline all the time.
meta.apply(lsn, self, ctx).await
}
/// While applying a metadata op, write a materialized page.
fn put_metadata_page(&mut self, lsn: Lsn, key: Key, value: Bytes) {
eprintln!("put_metadata_page {key} @ {lsn}");
self.put_at_lsn(lsn, key, Value::Image(value.clone()));
self.metadata_state.put(lsn, key, value);
}
/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
@@ -1092,9 +1338,6 @@ impl<'a> DatadirModification<'a> {
self.pending_directory_entries.push((DirectoryKind::Db, 0));
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory
self.init_aux_dir()?;
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
xids: HashSet::new(),
})?;
@@ -1196,33 +1439,12 @@ impl<'a> DatadirModification<'a> {
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Add it to the directory (if it doesn't exist already)
let buf = self.get(DBDIR_KEY, ctx).await?;
let mut dbdir = DbDirectory::des(&buf)?;
let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
if r.is_none() || r == Some(false) {
// The dbdir entry didn't exist, or it contained a
// 'false'. The 'insert' call already updated it with
// 'true', now write the updated 'dbdirs' map back.
let buf = DbDirectory::ser(&dbdir)?;
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory as well
self.init_aux_dir()?;
}
if r.is_none() {
// Create RelDirectory
let buf = RelDirectory::ser(&RelDirectory {
rels: HashSet::new(),
})?;
self.pending_directory_entries.push((DirectoryKind::Rel, 0));
self.put(
rel_dir_to_key(spcnode, dbnode),
Value::Image(Bytes::from(buf)),
);
}
self.put_metadata_op(
self.lsn,
MetadataOp::UpsertRelDirectory { spcnode, dbnode },
ctx,
)
.await?;
self.put(relmap_file_key(spcnode, dbnode), Value::Image(img));
Ok(())
}
@@ -1316,56 +1538,15 @@ impl<'a> DatadirModification<'a> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> Result<(), RelationError> {
if rel.relnode == 0 {
return Err(RelationError::InvalidRelnode);
}
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
.context("deserialize db")?;
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir =
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
// Didn't exist. Update dbdir
e.insert(false);
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
self.pending_directory_entries
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
self.put(DBDIR_KEY, Value::Image(buf.into()));
) -> anyhow::Result<()> {
// TODO: here, or earlier, validate that rel.relnode != 0 -- perhaps on construction of the RelTag?
// and create the RelDirectory
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
self.pending_directory_entries
.push((DirectoryKind::Rel, rel_dir.rels.len()));
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&rel_dir).context("serialize")?,
)),
);
// Put size
let size_key = rel_size_to_key(rel);
let buf = nblocks.to_le_bytes();
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
self.pending_nblocks += nblocks as i64;
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
self.put_metadata_op(
self.lsn,
MetadataOp::UpsertRelDirectory2 { rel, nblocks },
ctx,
)
.await?;
// Even if nblocks > 0, we don't insert any actual blocks here. That's up to the
// caller.
@@ -1570,19 +1751,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() {
return Ok(());
}
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.pending_directory_entries
.push((DirectoryKind::AuxFiles, 0));
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
Ok(())
}
pub async fn put_file(
&mut self,
path: &str,
@@ -1877,6 +2045,7 @@ impl<'a> DatadirModification<'a> {
// Internal helper functions to batch the modifications
// TODO: retire this once all metadata writes are going via MetadataWriteState
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
// Have we already updated the same key? Read the latest pending updated
// version in that case.
@@ -1910,15 +2079,19 @@ impl<'a> DatadirModification<'a> {
}
fn put(&mut self, key: Key, val: Value) {
self.put_at_lsn(self.lsn, key, val)
}
fn put_at_lsn(&mut self, lsn: Lsn, key: Key, val: Value) {
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn
if let Some((last_lsn, last_value)) = values.last_mut() {
if *last_lsn == self.lsn {
if *last_lsn == lsn {
*last_value = val;
return;
}
}
values.push((self.lsn, val));
values.push((lsn, val));
}
fn delete(&mut self, key_range: Range<Key>) {

View File

@@ -56,6 +56,7 @@ impl Statvfs {
}
pub mod mock {
use anyhow::Context;
use camino::Utf8Path;
use regex::Regex;
use tracing::log::info;
@@ -134,30 +135,14 @@ pub mod mock {
{
continue;
}
let m = match entry.metadata() {
Ok(m) => m,
Err(e) if is_not_found(&e) => {
// some temp file which got removed right as we are walking
continue;
}
Err(e) => {
return Err(anyhow::Error::new(e)
.context(format!("get metadata of {:?}", entry.path())))
}
};
total += m.len();
total += entry
.metadata()
.with_context(|| format!("get metadata of {:?}", entry.path()))?
.len();
}
Ok(total)
}
fn is_not_found(e: &walkdir::Error) -> bool {
let Some(io_error) = e.io_error() else {
return false;
};
let kind = io_error.kind();
matches!(kind, std::io::ErrorKind::NotFound)
}
pub struct Statvfs {
pub blocks: u64,
pub blocks_available: u64,

View File

@@ -148,7 +148,6 @@ pub(crate) mod timeline;
pub mod size;
mod gc_block;
pub(crate) mod throttle;
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -304,12 +303,6 @@ pub struct Tenant {
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
/// `index_part.json` based gc blocking reason tracking.
///
/// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
/// proceeding.
pub(crate) gc_block: gc_block::GcBlock,
l0_flush_global_state: L0FlushGlobalState,
}
@@ -601,12 +594,6 @@ impl From<PageReconstructError> for GcError {
}
}
impl From<timeline::layer_manager::Shutdown> for GcError {
fn from(_: timeline::layer_manager::Shutdown) -> Self {
GcError::TimelineCancelled
}
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum LoadConfigError {
#[error("TOML deserialization error: '{0}'")]
@@ -716,7 +703,6 @@ impl Tenant {
.read()
.await
.layer_map()
.expect("currently loading, layer manager cannot be shutdown already")
.iter_historic_layers()
.next()
.is_some(),
@@ -1050,8 +1036,6 @@ impl Tenant {
}
}
let mut gc_blocks = HashMap::new();
// For every timeline, download the metadata file, scan the local directory,
// and build a layer map that contains an entry for each remote and local
// layer file.
@@ -1061,16 +1045,6 @@ impl Tenant {
.remove(&timeline_id)
.expect("just put it in above");
if let Some(blocking) = index_part.gc_blocking.as_ref() {
// could just filter these away, but it helps while testing
anyhow::ensure!(
!blocking.reasons.is_empty(),
"index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
);
let prev = gc_blocks.insert(timeline_id, blocking.reasons);
assert!(prev.is_none());
}
// TODO again handle early failure
self.load_remote_timeline(
timeline_id,
@@ -1115,8 +1089,6 @@ impl Tenant {
// IndexPart is the source of truth.
self.clean_up_timelines(&existent_timelines)?;
self.gc_block.set_scanned(gc_blocks);
fail::fail_point!("attach-before-activate", |_| {
anyhow::bail!("attach-before-activate");
});
@@ -1707,14 +1679,6 @@ impl Tenant {
}
}
let _guard = match self.gc_block.start().await {
Ok(guard) => guard,
Err(reasons) => {
info!("Skipping GC: {reasons}");
return Ok(GcResult::default());
}
};
self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await
}
@@ -2727,7 +2691,6 @@ impl Tenant {
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
gc_block: Default::default(),
l0_flush_global_state,
}
}
@@ -3012,6 +2975,54 @@ impl Tenant {
// because that will stall branch creation.
let gc_cs = self.gc_cs.lock().await;
// Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
// depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
// and fail out if it's inaccurate.
// (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
{
let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
BTreeMap::new();
timelines.iter().for_each(|timeline| {
if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
let ancestor_children =
all_branchpoints.entry(*ancestor_timeline_id).or_default();
ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
}
});
for timeline in &timelines {
let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
.remove(&timeline.timeline_id)
.unwrap_or_default();
branchpoints.sort_by_key(|b| b.0);
let target = timeline.gc_info.read().unwrap();
// We require that retain_lsns contains everything in `branchpoints`, but not that
// they are exactly equal: timeline deletions can race with us, so retain_lsns
// may contain some extra stuff. It is safe to have extra timelines in there, because it
// just means that we retain slightly more data than we otherwise might.
let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
for b in &branchpoints {
if !have_branchpoints.contains(b) {
tracing::error!(
"Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}",
branchpoints,
target.retain_lsns
);
debug_assert!(false);
// Do not GC based on bad information!
// (ab-use an existing GcError type rather than adding a new one, since this is a
// "should never happen" check that will be removed soon).
return Err(GcError::Remote(anyhow::anyhow!(
"retain_lsns failed validation!"
)));
}
}
}
}
// Ok, we now know all the branch points.
// Update the GC information for each timeline.
let mut gc_timelines = Vec::with_capacity(timelines.len());
@@ -4081,7 +4092,7 @@ pub(crate) mod harness {
#[cfg(test)]
mod tests {
use std::collections::{BTreeMap, BTreeSet};
use std::collections::BTreeMap;
use super::*;
use crate::keyspace::KeySpaceAccum;
@@ -4633,10 +4644,10 @@ mod tests {
let layer_map = tline.layers.read().await;
let level0_deltas = layer_map
.layer_map()?
.level0_deltas()
.iter()
.map(|desc| layer_map.get_from_desc(desc))
.layer_map()
.get_level0_deltas()
.into_iter()
.map(|desc| layer_map.get_from_desc(&desc))
.collect::<Vec<_>>();
assert!(!level0_deltas.is_empty());
@@ -4756,7 +4767,7 @@ mod tests {
lsn: Lsn,
repeat: usize,
key_count: usize,
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
) -> anyhow::Result<()> {
let compact = true;
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
}
@@ -4769,9 +4780,7 @@ mod tests {
repeat: usize,
key_count: usize,
compact: bool,
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
) -> anyhow::Result<()> {
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0;
@@ -4792,7 +4801,6 @@ mod tests {
ctx,
)
.await?;
inserted.entry(test_key).or_default().insert(lsn);
writer.finish_write(lsn);
drop(writer);
@@ -4817,7 +4825,7 @@ mod tests {
assert_eq!(res.layers_removed, 0, "this never removes anything");
}
Ok(inserted)
Ok(())
}
//
@@ -4864,16 +4872,14 @@ mod tests {
.await?;
let lsn = Lsn(0x10);
let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
let guard = tline.layers.read().await;
let lm = guard.layer_map()?;
lm.dump(true, &ctx).await?;
guard.layer_map().dump(true, &ctx).await?;
let mut reads = Vec::new();
let mut prev = None;
lm.iter_historic_layers().for_each(|desc| {
guard.layer_map().iter_historic_layers().for_each(|desc| {
if !desc.is_delta() {
prev = Some(desc.clone());
return;
@@ -4927,39 +4933,9 @@ mod tests {
&ctx,
)
.await;
let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
let mut expect_missing = false;
let mut key = read.start().unwrap();
while key != read.end().unwrap() {
if let Some(lsns) = inserted.get(&key) {
let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
match expected_lsn {
Some(lsn) => {
expected_lsns.insert(key, *lsn);
}
None => {
expect_missing = true;
break;
}
}
} else {
expect_missing = true;
break;
}
key = key.next();
}
if expect_missing {
assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
} else {
for (key, image) in vectored_res? {
let expected_lsn = expected_lsns.get(&key).expect("determined above");
let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
assert_eq!(image?, expected_image);
}
}
tline
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
.await;
}
Ok(())
@@ -5009,6 +4985,10 @@ mod tests {
)
.await;
child_timeline
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
.await;
let images = vectored_res?;
assert!(images.is_empty());
Ok(())
@@ -5879,12 +5859,23 @@ mod tests {
tline.freeze_and_flush().await?; // force create a delta layer
}
let before_num_l0_delta_files =
tline.layers.read().await.layer_map()?.level0_deltas().len();
let before_num_l0_delta_files = tline
.layers
.read()
.await
.layer_map()
.get_level0_deltas()
.len();
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
let after_num_l0_delta_files = tline
.layers
.read()
.await
.layer_map()
.get_level0_deltas()
.len();
assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
@@ -6908,10 +6899,7 @@ mod tests {
}
let cancel = CancellationToken::new();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
for (idx, expected) in expected_result.iter().enumerate() {
assert_eq!(
@@ -7005,10 +6993,7 @@ mod tests {
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
Ok(())
}
@@ -7342,10 +7327,7 @@ mod tests {
}
let cancel = CancellationToken::new();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
for idx in 0..10 {
assert_eq!(
@@ -7371,10 +7353,7 @@ mod tests {
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
Ok(())
}
@@ -7919,28 +7898,11 @@ mod tests {
verify_result().await;
let cancel = CancellationToken::new();
let mut dryrun_flags = EnumSet::new();
dryrun_flags.insert(CompactFlags::DryRun);
tline
.compact_with_gc(&cancel, dryrun_flags, &ctx)
.await
.unwrap();
// We expect the layer map to be the same because of the dry run flag, but we don't know whether there will be other background jobs
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
verify_result().await;
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;
// compact again
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;
// increase GC horizon and compact again
@@ -7950,17 +7912,11 @@ mod tests {
guard.cutoffs.time = Lsn(0x38);
guard.cutoffs.space = Lsn(0x38);
}
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
// not increasing the GC horizon and compact again
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;
Ok(())
@@ -8141,10 +8097,7 @@ mod tests {
verify_result().await;
let cancel = CancellationToken::new();
branch_tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
verify_result().await;

View File

@@ -29,7 +29,6 @@ impl EphemeralFile {
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
) -> Result<EphemeralFile, io::Error> {
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -52,12 +51,10 @@ impl EphemeralFile {
)
.await?;
let prewarm = conf.l0_flush.prewarm_on_write();
Ok(EphemeralFile {
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
rw: page_caching::RW::new(file, prewarm, gate_guard),
rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
})
}
@@ -164,11 +161,7 @@ mod tests {
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
let gate = utils::sync::gate::Gate::default();
let entered = gate.enter().unwrap();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;
let pos_foo = file.write_blob(b"foo", &ctx).await?;
assert_eq!(
@@ -222,38 +215,4 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn ephemeral_file_holds_gate_open() {
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
let (conf, tenant_id, timeline_id, ctx) =
harness("ephemeral_file_holds_gate_open").unwrap();
let gate = utils::sync::gate::Gate::default();
let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let mut closing = tokio::task::spawn(async move {
gate.close().await;
});
// gate is entered until the ephemeral file is dropped
// do not start paused: tokio-epoll-uring has a sleep loop
tokio::time::pause();
tokio::time::timeout(FOREVER, &mut closing)
.await
.expect_err("closing cannot complete before dropping");
// this is a requirement of the reset_tenant functionality: we have to be able to restart a
// tenant fast, and for that, we need all tenant_dir operations to be guarded by entering a gate
drop(file);
tokio::time::timeout(FOREVER, &mut closing)
.await
.expect("closing completes right away")
.expect("closing does not panic");
}
}
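The deleted test above pins down the invariant the gate guard provides: as long as an `EphemeralFile` (or any other holder) keeps its `GateGuard` alive, the `Gate::close()` used by tenant shutdown/reset cannot complete; dropping the file releases the gate. Below is a toy re-implementation of that drop-driven pattern on top of a tokio `Semaphore`, purely for illustration; the real `utils::sync::gate` module is internal to the pageserver and its implementation may differ.

```rust
// Toy gate: close() completes only once every outstanding guard has been dropped.
// Not the real utils::sync::gate implementation.
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

const MAX_GUARDS: u32 = 1 << 20;

#[derive(Clone)]
struct Gate(Arc<Semaphore>);
struct GateGuard(#[allow(dead_code)] OwnedSemaphorePermit);

impl Gate {
    fn new() -> Self {
        Gate(Arc::new(Semaphore::new(MAX_GUARDS as usize)))
    }
    fn enter(&self) -> GateGuard {
        GateGuard(self.0.clone().try_acquire_owned().expect("gate is closed"))
    }
    async fn close(&self) {
        // Can only acquire all permits once no guard is outstanding.
        self.0
            .acquire_many(MAX_GUARDS)
            .await
            .expect("semaphore is never closed")
            .forget();
    }
}

#[tokio::main]
async fn main() {
    let gate = Gate::new();
    let guard = gate.enter(); // e.g. held by an ephemeral file for its whole lifetime

    let mut closing = {
        let gate = gate.clone();
        tokio::spawn(async move { gate.close().await })
    };

    // While the guard is alive, close() must not complete.
    let wait = tokio::time::timeout(Duration::from_millis(50), &mut closing).await;
    assert!(wait.is_err(), "closing cannot complete before dropping");

    // Dropping the guard (the file) lets shutdown proceed.
    drop(guard);
    closing.await.expect("closing does not panic");
}
```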

View File

@@ -18,8 +18,6 @@ use super::zero_padded_read_write;
pub struct RW {
page_cache_file_id: page_cache::FileId,
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
/// The gate guard is held for as long as we need to do operations in the path (delete on drop).
_gate_guard: utils::sync::gate::GateGuard,
}
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
@@ -31,11 +29,7 @@ pub enum PrewarmOnWrite {
}
impl RW {
pub fn new(
file: VirtualFile,
prewarm_on_write: PrewarmOnWrite,
_gate_guard: utils::sync::gate::GateGuard,
) -> Self {
pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
let page_cache_file_id = page_cache::next_file_id();
Self {
page_cache_file_id,
@@ -44,7 +38,6 @@ impl RW {
file,
prewarm_on_write,
)),
_gate_guard,
}
}
@@ -152,7 +145,6 @@ impl Drop for RW {
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file
// we are clear to do this, because we have entered a gate
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {

View File

@@ -1,213 +0,0 @@
use std::collections::HashMap;
use utils::id::TimelineId;
use super::remote_timeline_client::index::GcBlockingReason;
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
#[derive(Default)]
pub(crate) struct GcBlock {
/// The timelines which have current reasons to block gc.
///
/// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
/// to keep this field up to date with RemoteTimelineClient `upload_queue.dirty`.
reasons: std::sync::Mutex<Storage>,
blocking: tokio::sync::Mutex<()>,
}
impl GcBlock {
/// Start another gc iteration.
///
/// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
/// its ending, or if not currently possible, a value describing the reasons why not.
///
/// Cancellation safe.
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
let reasons = {
let g = self.reasons.lock().unwrap();
// TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
// tests, we use everything. we should warn if the gc has been consecutively blocked
// for more than 1h (within single tenant session?).
BlockingReasons::clean_and_summarize(g)
};
if let Some(reasons) = reasons {
Err(reasons)
} else {
Ok(Guard {
_inner: self.blocking.lock().await,
})
}
}
pub(crate) fn summary(&self) -> Option<BlockingReasons> {
let g = self.reasons.lock().unwrap();
BlockingReasons::summarize(&g)
}
/// Start blocking gc for this one timeline for the given reason.
///
/// This is not a guard-based API; instead it mimics a set-like API. The returned future will not
/// resolve until an existing gc round has completed.
///
/// Returns true if this block was new, false if gc was already blocked for this reason.
///
/// Cancellation safe: cancelling after the first poll will still leave the gc blocking reason in place.
pub(crate) async fn insert(
&self,
timeline: &super::Timeline,
reason: GcBlockingReason,
) -> anyhow::Result<bool> {
let (added, uploaded) = {
let mut g = self.reasons.lock().unwrap();
let set = g.entry(timeline.timeline_id).or_default();
let added = set.insert(reason);
// LOCK ORDER: intentionally hold the lock, see self.reasons.
let uploaded = timeline
.remote_client
.schedule_insert_gc_block_reason(reason)?;
(added, uploaded)
};
uploaded.await?;
// ensure that any ongoing gc iteration has completed
drop(self.blocking.lock().await);
Ok(added)
}
/// Remove blocking gc for this one timeline and the given reason.
pub(crate) async fn remove(
&self,
timeline: &super::Timeline,
reason: GcBlockingReason,
) -> anyhow::Result<()> {
use std::collections::hash_map::Entry;
super::span::debug_assert_current_span_has_tenant_and_timeline_id();
let (remaining_blocks, uploaded) = {
let mut g = self.reasons.lock().unwrap();
match g.entry(timeline.timeline_id) {
Entry::Occupied(mut oe) => {
let set = oe.get_mut();
set.remove(reason);
if set.is_empty() {
oe.remove();
}
}
Entry::Vacant(_) => {
// we must still do the index_part.json update regardless, in case we had earlier
// been cancelled
}
}
let remaining_blocks = g.len();
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
let uploaded = timeline
.remote_client
.schedule_remove_gc_block_reason(reason)?;
(remaining_blocks, uploaded)
};
uploaded.await?;
// no need to synchronize with gc iteration again
if remaining_blocks > 0 {
tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
} else {
tracing::info!("gc is now unblocked for the tenant");
}
Ok(())
}
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
let unblocked = {
let mut g = self.reasons.lock().unwrap();
if g.is_empty() {
return;
}
g.remove(&timeline.timeline_id);
BlockingReasons::clean_and_summarize(g).is_none()
};
if unblocked {
tracing::info!("gc is now unblocked following deletion");
}
}
/// Initialize with the non-deleted timelines of this tenant.
pub(crate) fn set_scanned(&self, scanned: Storage) {
let mut g = self.reasons.lock().unwrap();
assert!(g.is_empty());
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
tracing::info!(summary=?reasons, "initialized with gc blocked");
}
}
}
pub(super) struct Guard<'a> {
_inner: tokio::sync::MutexGuard<'a, ()>,
}
#[derive(Debug)]
pub(crate) struct BlockingReasons {
timelines: usize,
reasons: enumset::EnumSet<GcBlockingReason>,
}
impl std::fmt::Display for BlockingReasons {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{} timelines block for {:?}",
self.timelines, self.reasons
)
}
}
impl BlockingReasons {
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
let mut reasons = enumset::EnumSet::empty();
g.retain(|_key, value| {
reasons = reasons.union(*value);
!value.is_empty()
});
if !g.is_empty() {
Some(BlockingReasons {
timelines: g.len(),
reasons,
})
} else {
None
}
}
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
if g.is_empty() {
None
} else {
let reasons = g
.values()
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
Some(BlockingReasons {
timelines: g.len(),
reasons,
})
}
}
}
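To summarize how the pieces above are meant to be driven: an operation such as ancestor detach calls insert (persisting the reason via the remote client and waiting out any in-flight gc round), gc itself calls start and holds the returned Guard for the whole iteration, and the blocker eventually calls remove. A minimal sketch of that shape, with the remote index_part upload and per-timeline bookkeeping stripped out; the names are illustrative and a tokio runtime is assumed.

```rust
// Hedged sketch of the gc-blocking pattern above, in-memory state only.
use std::collections::HashSet;
use std::sync::Mutex;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
enum Reason {
    Manual,
    DetachAncestor,
}

#[derive(Default)]
struct GcBlockSketch {
    reasons: Mutex<HashSet<Reason>>,
    // Held for the duration of a gc iteration so blockers can wait it out.
    blocking: tokio::sync::Mutex<()>,
}

struct GcGuard<'a> {
    _inner: tokio::sync::MutexGuard<'a, ()>,
}

impl GcBlockSketch {
    /// Start a gc iteration, or report why it is blocked.
    async fn start(&self) -> Result<GcGuard<'_>, Vec<Reason>> {
        let blocked: Vec<Reason> = self.reasons.lock().unwrap().iter().copied().collect();
        if blocked.is_empty() {
            Ok(GcGuard { _inner: self.blocking.lock().await })
        } else {
            Err(blocked)
        }
    }

    /// Add a blocking reason, then wait for any in-flight gc round to finish.
    async fn insert(&self, reason: Reason) -> bool {
        let added = self.reasons.lock().unwrap().insert(reason);
        drop(self.blocking.lock().await); // synchronize with an ongoing gc iteration
        added
    }

    /// Remove a blocking reason.
    fn remove(&self, reason: Reason) -> bool {
        self.reasons.lock().unwrap().remove(&reason)
    }
}

#[tokio::main]
async fn main() {
    let gc_block = GcBlockSketch::default();
    assert!(gc_block.insert(Reason::DetachAncestor).await);
    assert!(gc_block.start().await.is_err()); // gc is blocked while detach runs
    gc_block.remove(Reason::DetachAncestor);
    let _guard = gc_block.start().await.expect("unblocked again");
}
```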

View File

@@ -846,8 +846,8 @@ impl LayerMap {
}
/// Return all L0 delta layers
pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
&self.l0_delta_layers
pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
self.l0_delta_layers.to_vec()
}
/// debugging function to print out the contents of the layer map

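The hunk above switches between two shapes of the L0 accessor: a borrowing level0_deltas() that returns &Vec<Arc<PersistentLayerDesc>> and a cloning get_level0_deltas() that hands out an owned Vec of Arc handles. The borrowed form is cheaper but keeps the layer-map guard alive for as long as the reference is held; the owned form lets the caller drop the guard immediately, at the cost of an allocation and refcount bumps. A small self-contained sketch of the trade-off, where Desc stands in for PersistentLayerDesc:

```rust
use std::sync::Arc;

struct Desc {
    file_size: u64,
}

struct LayerMapSketch {
    l0_delta_layers: Vec<Arc<Desc>>,
}

impl LayerMapSketch {
    // Borrowing accessor: cheap, but the returned reference pins the layer map
    // (and the lock guard protecting it) for as long as it is held.
    fn level0_deltas(&self) -> &Vec<Arc<Desc>> {
        &self.l0_delta_layers
    }

    // Cloning accessor: copies the Arc handles so the caller can drop the guard
    // right away, paying one allocation and a refcount bump per layer.
    fn get_level0_deltas(&self) -> Vec<Arc<Desc>> {
        self.l0_delta_layers.to_vec()
    }
}

fn main() {
    let map = LayerMapSketch {
        l0_delta_layers: vec![Arc::new(Desc { file_size: 1 }), Arc::new(Desc { file_size: 2 })],
    };
    // Borrowed: fine for a quick count while the map is still accessible.
    let n = map.level0_deltas().len();
    // Owned: the Vec outlives the map (or, in the real code, the read guard).
    let owned = map.get_level0_deltas();
    drop(map);
    let total: u64 = owned.iter().map(|d| d.file_size).sum();
    assert_eq!(n, owned.len());
    assert_eq!(total, 3);
}
```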
View File

@@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant;
use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::collections::{BTreeMap, HashMap};
use std::ops::Deref;
use std::sync::Arc;
use std::time::Duration;
@@ -224,8 +224,21 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
}
/// See [`Self::spawn`].
#[derive(Clone, Default)]
pub struct BackgroundPurges(tokio_util::task::TaskTracker);
#[derive(Clone)]
pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
enum BackgroundPurgesInner {
Open(tokio::task::JoinSet<()>),
// we use the async mutex for coalescing
ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
}
impl Default for BackgroundPurges {
fn default() -> Self {
Self(Arc::new(std::sync::Mutex::new(
BackgroundPurgesInner::Open(JoinSet::new()),
)))
}
}
impl BackgroundPurges {
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
@@ -234,32 +247,24 @@ impl BackgroundPurges {
/// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
/// Thus the [`BackgroundPurges`] type to keep track of these tasks.
pub fn spawn(&self, tmp_path: Utf8PathBuf) {
// because on shutdown we close and wait, we are misusing TaskTracker a bit.
//
// so first acquire a token, then check if the tracker has been closed. the tracker might get closed
// right after, but at least the shutdown will wait for what we are spawning next.
let token = self.0.token();
if self.0.is_closed() {
warn!(
%tmp_path,
"trying to spawn background purge during shutdown, ignoring"
);
return;
}
let span = info_span!(parent: None, "background_purge", %tmp_path);
let task = move || {
let _token = token;
let _entered = span.entered();
if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) {
// should we fatal_io_error here?
warn!(%error, "failed to purge tenant directory");
let mut guard = self.0.lock().unwrap();
let jset = match &mut *guard {
BackgroundPurgesInner::Open(ref mut jset) => jset,
BackgroundPurgesInner::ShuttingDown(_) => {
warn!("trying to spawn background purge during shutdown, ignoring");
return;
}
};
BACKGROUND_RUNTIME.spawn_blocking(task);
jset.spawn_on(
async move {
if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
// should we fatal_io_error here?
warn!(%error, path=%tmp_path, "failed to purge tenant directory");
}
}
.instrument(info_span!(parent: None, "background_purge")),
BACKGROUND_RUNTIME.handle(),
);
}
/// When this future completes, all background purges have completed.
@@ -273,9 +278,42 @@ impl BackgroundPurges {
/// instances of this future will continue to be correct.
#[instrument(skip_all)]
pub async fn shutdown(&self) {
// forbid new tasks (can be called many times)
self.0.close();
self.0.wait().await;
let jset = {
let mut guard = self.0.lock().unwrap();
match &mut *guard {
BackgroundPurgesInner::Open(jset) => {
*guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
std::mem::take(jset),
)))
}
BackgroundPurgesInner::ShuttingDown(_) => {
// calling shutdown multiple times is most likely a bug in pageserver shutdown code
warn!("already shutting down");
}
};
match &mut *guard {
BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
BackgroundPurgesInner::Open(_) => {
unreachable!("above code transitions into shut down state");
}
}
};
let mut jset = jset.lock().await; // concurrent callers coalesce here
while let Some(res) = jset.join_next().await {
match res {
Ok(()) => {}
Err(e) if e.is_panic() => {
// If it panicked, the error is already logged by the panic hook.
}
Err(e) if e.is_cancelled() => {
unreachable!("we don't cancel the joinset or runtime")
}
Err(e) => {
// No idea when this can happen, but let's log it.
warn!(%e, "background purge task failed or panicked");
}
}
}
}
}
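The TaskTracker-based variant shown in this hunk depends on the ordering subtlety called out in its comment: the spawner takes a token before checking is_closed(), so that a concurrently running shutdown (close() followed by wait()) can never miss a purge that passed the check. A minimal sketch of that ordering, assuming the tokio and tokio-util crates:

```rust
use tokio_util::task::TaskTracker;

#[tokio::main]
async fn main() {
    let tracker = TaskTracker::new();

    // Spawn side: take a token *before* checking for shutdown, so that if the
    // check passes, shutdown's wait() is guaranteed to cover this task.
    let token = tracker.token();
    if tracker.is_closed() {
        eprintln!("shutting down, not spawning");
    } else {
        tokio::task::spawn_blocking(move || {
            let _token = token; // released when the work is done
            // ... remove the temporary directory here ...
        });
    }

    // Shutdown side: forbid new tasks, then wait for everything already tracked.
    tracker.close();
    tracker.wait().await;
}
```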
@@ -1729,9 +1767,14 @@ impl TenantManager {
let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
for timeline in timelines.values() {
tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
let layers = timeline.layers.read().await;
let timeline_layers = timeline
.layers
.read()
.await
.likely_resident_layers()
.collect::<Vec<_>>();
for layer in layers.likely_resident_layers() {
for layer in timeline_layers {
let relative_path = layer
.local_path()
.strip_prefix(&parent_path)
@@ -1928,8 +1971,7 @@ impl TenantManager {
timeline_id: TimelineId,
prepared: PreparedTimelineDetach,
ctx: &RequestContext,
) -> Result<HashSet<TimelineId>, anyhow::Error> {
// FIXME: this is unnecessary, slotguard already has these semantics
) -> Result<Vec<TimelineId>, anyhow::Error> {
struct RevertOnDropSlot(Option<SlotGuard>);
impl Drop for RevertOnDropSlot {

View File

@@ -800,123 +800,6 @@ impl RemoteTimelineClient {
.context("wait completion")
}
/// Adds a gc blocking reason for this timeline if one does not exist already.
///
/// A retryable step of timeline detach ancestor.
///
/// Returns a future which waits until the completion of the upload.
pub(crate) fn schedule_insert_gc_block_reason(
self: &Arc<Self>,
reason: index::GcBlockingReason,
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
{
let maybe_barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let index::GcBlockingReason::DetachAncestor = reason {
if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
drop(guard);
panic!("cannot start detach ancestor if there is nothing to detach from");
}
}
let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
let current = upload_queue.dirty.gc_blocking.as_ref();
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
match (current, uploaded) {
(x, y) if wanted(x) && wanted(y) => None,
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
// Usual case: !wanted(x) && !wanted(y)
//
// Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
// turn on and off some reason.
(x, y) => {
if !wanted(x) && wanted(y) {
// this could be avoided by having external in-memory synchronization, like
// timeline detach ancestor
warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
}
// at this point, the metadata must always show that there is a parent
upload_queue.dirty.gc_blocking = current
.map(|x| x.with_reason(reason))
.or_else(|| Some(index::GcBlocking::started_now_for(reason)));
self.schedule_index_upload(upload_queue)?;
Some(self.schedule_barrier0(upload_queue))
}
}
};
Ok(async move {
if let Some(barrier) = maybe_barrier {
Self::wait_completion0(barrier).await?;
}
Ok(())
})
}
/// Removes a gc blocking reason for this timeline if one exists.
///
/// A retryable step of timeline detach ancestor.
///
/// Returns a future which waits until the completion of the upload.
pub(crate) fn schedule_remove_gc_block_reason(
self: &Arc<Self>,
reason: index::GcBlockingReason,
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
{
let maybe_barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let index::GcBlockingReason::DetachAncestor = reason {
if !upload_queue
.clean
.0
.lineage
.is_detached_from_original_ancestor()
{
drop(guard);
panic!("cannot complete timeline_ancestor_detach while not detached");
}
}
let wanted = |x: Option<&index::GcBlocking>| {
x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
};
let current = upload_queue.dirty.gc_blocking.as_ref();
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
match (current, uploaded) {
(x, y) if wanted(x) && wanted(y) => None,
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
(x, y) => {
if !wanted(x) && wanted(y) {
warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
}
upload_queue.dirty.gc_blocking =
current.as_ref().and_then(|x| x.without_reason(reason));
assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
// FIXME: bogus ?
self.schedule_index_upload(upload_queue)?;
Some(self.schedule_barrier0(upload_queue))
}
}
};
Ok(async move {
if let Some(barrier) = maybe_barrier {
Self::wait_completion0(barrier).await?;
}
Ok(())
})
}
/// Launch an upload operation in the background; the file is added to be included in next
/// `index_part.json` upload.
pub(crate) fn schedule_layer_file_upload(

View File

@@ -60,9 +60,6 @@ pub struct IndexPart {
#[serde(default)]
pub(crate) lineage: Lineage,
#[serde(skip_serializing_if = "Option::is_none", default)]
pub(crate) gc_blocking: Option<GcBlocking>,
/// Describes the kind of aux files stored in the timeline.
///
/// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
@@ -88,11 +85,10 @@ impl IndexPart {
/// - 6: last_aux_file_policy is added.
/// - 7: metadata_bytes is no longer written, but still read
/// - 8: added `archived_at`
/// - 9: +gc_blocking
const LATEST_VERSION: usize = 9;
const LATEST_VERSION: usize = 8;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];
pub const FILE_NAME: &'static str = "index_part.json";
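For illustration only, here is a hypothetical sketch of how a version list like KNOWN_VERSIONS can gate parsing of an index_part.json-style document. This is not the real IndexPart deserialization path, which may treat unknown versions differently; serde, serde_json, and anyhow are assumed as dependencies.

```rust
use serde::Deserialize;

const LATEST_VERSION: usize = 8;
const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8];

// Only the version field matters for the check; serde ignores the rest by default.
#[derive(Deserialize)]
struct VersionOnly {
    version: usize,
}

fn check_version(bytes: &[u8]) -> anyhow::Result<usize> {
    let v: VersionOnly = serde_json::from_slice(bytes)?;
    anyhow::ensure!(
        KNOWN_VERSIONS.contains(&v.version),
        "unknown index_part version {} (latest known: {})",
        v.version,
        LATEST_VERSION
    );
    Ok(v.version)
}

fn main() -> anyhow::Result<()> {
    let n = check_version(br#"{ "version": 8, "layer_metadata": {} }"#)?;
    println!("parsed version {n}");
    assert!(check_version(br#"{ "version": 99 }"#).is_err());
    Ok(())
}
```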
@@ -105,7 +101,6 @@ impl IndexPart {
deleted_at: None,
archived_at: None,
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: None,
}
}
@@ -256,64 +251,6 @@ impl Lineage {
}
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct GcBlocking {
pub(crate) started_at: NaiveDateTime,
pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
}
#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
#[enumset(serialize_repr = "list")]
pub(crate) enum GcBlockingReason {
Manual,
DetachAncestor,
}
impl GcBlocking {
pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
GcBlocking {
started_at: chrono::Utc::now().naive_utc(),
reasons: enumset::EnumSet::only(reason),
}
}
/// Returns true if the given reason is one of the reasons why the gc is blocked.
pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
self.reasons.contains(reason)
}
/// Returns a version of self with the given reason.
pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
assert!(!self.blocked_by(reason));
let mut reasons = self.reasons;
reasons.insert(reason);
Self {
started_at: self.started_at,
reasons,
}
}
/// Returns a version of self without the given reason. Assumption is that if
/// there are no more reasons, we can unblock the gc by returning `None`.
pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
assert!(self.blocked_by(reason));
if self.reasons.len() == 1 {
None
} else {
let mut reasons = self.reasons;
assert!(reasons.remove(reason));
assert!(!reasons.is_empty());
Some(Self {
started_at: self.started_at,
reasons,
})
}
}
}
#[cfg(test)]
mod tests {
use super::*;
@@ -355,7 +292,6 @@ mod tests {
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -399,7 +335,6 @@ mod tests {
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -444,7 +379,6 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -492,7 +426,6 @@ mod tests {
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -535,7 +468,6 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -581,7 +513,6 @@ mod tests {
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -632,7 +563,6 @@ mod tests {
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
gc_blocking: None,
last_aux_file_policy: Some(AuxFilePolicy::V2),
};
@@ -688,7 +618,6 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: Default::default(),
};
@@ -745,7 +674,6 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: Default::default(),
};
@@ -753,68 +681,6 @@ mod tests {
assert_eq!(part, expected);
}
#[test]
fn v9_indexpart_is_parsed() {
let example = r#"{
"version": 9,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata": {
"disk_consistent_lsn": "0/16960E8",
"prev_record_lsn": "0/1696070",
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
"ancestor_lsn": "0/0",
"latest_gc_cutoff_lsn": "0/1696070",
"initdb_lsn": "0/1696070",
"pg_version": 14
},
"gc_blocking": {
"started_at": "2024-07-19T09:00:00.123",
"reasons": ["DetachAncestor"]
}
}"#;
let expected = IndexPart {
version: 9,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::new(
Lsn::from_str("0/16960E8").unwrap(),
Some(Lsn::from_str("0/1696070").unwrap()),
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
).with_recalculated_checksum().unwrap(),
deleted_at: None,
lineage: Default::default(),
gc_blocking: Some(GcBlocking {
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
}),
last_aux_file_policy: Default::default(),
archived_at: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
}

View File

@@ -8,9 +8,6 @@ mod layer_desc;
mod layer_name;
pub mod merge_iterator;
#[cfg(test)]
pub mod split_writer;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
use crate::walrecord::NeonWalRecord;
@@ -435,6 +432,21 @@ impl ReadableLayer {
}
}
/// Return value from [`Layer::get_value_reconstruct_data`]
#[derive(Clone, Copy, Debug)]
pub enum ValueReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue,
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing,
}
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
@@ -539,25 +551,19 @@ impl LayerAccessStats {
self.record_residence_event_at(SystemTime::now())
}
fn record_access_at(&self, now: SystemTime) -> bool {
pub(crate) fn record_access_at(&self, now: SystemTime) {
let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
// A layer which is accessed must be visible.
mask |= 0x1 << Self::VISIBILITY_SHIFT;
value |= 0x1 << Self::VISIBILITY_SHIFT;
let old_bits = self.write_bits(mask, value);
!matches!(
self.decode_visibility(old_bits),
LayerVisibilityHint::Visible
)
self.write_bits(mask, value);
}
/// Returns true if we modified the layer's visibility to set it to Visible implicitly
/// as a result of this access
pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
pub(crate) fn record_access(&self, ctx: &RequestContext) {
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
return false;
return;
}
self.record_access_at(SystemTime::now())

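For context on record_access_at above: LayerAccessStats packs a low-resolution access timestamp and a visibility bit into a single 8-byte atomic (the layer_size test later in this diff asserts size_of::<LayerAccessStats>() == 8) and updates it with a mask/value read-modify-write. A rough sketch of that packing follows; the shifts and widths are illustrative and do not match the real layout.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{SystemTime, UNIX_EPOCH};

const VISIBILITY_SHIFT: u32 = 0;
const ATIME_SHIFT: u32 = 1;
const ATIME_BITS: u32 = 32;

#[derive(Default)]
struct AccessStatsSketch(AtomicU64);

impl AccessStatsSketch {
    /// Overwrite the bits selected by `mask` with `value`, returning the previous word.
    fn write_bits(&self, mask: u64, value: u64) -> u64 {
        self.0
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |old| {
                Some((old & !mask) | (value & mask))
            })
            .unwrap()
    }

    /// Record an access: stamp a low-resolution access time and force the
    /// visibility bit on. Returns true if this access flipped the layer from
    /// non-visible to visible (mirroring the bool return discussed in the diff).
    fn record_access_at(&self, now: SystemTime) -> bool {
        let secs = now.duration_since(UNIX_EPOCH).unwrap().as_secs() & ((1u64 << ATIME_BITS) - 1);
        let mask = (((1u64 << ATIME_BITS) - 1) << ATIME_SHIFT) | (1 << VISIBILITY_SHIFT);
        let value = (secs << ATIME_SHIFT) | (1 << VISIBILITY_SHIFT);
        let old = self.write_bits(mask, value);
        (old & (1 << VISIBILITY_SHIFT)) == 0
    }
}

fn main() {
    let stats = AccessStatsSketch::default();
    assert!(stats.record_access_at(SystemTime::now())); // first access makes it visible
    assert!(!stats.record_access_at(SystemTime::now())); // already visible
}
```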
View File

@@ -36,12 +36,13 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner,
};
use crate::tenant::PageReconstructError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::{self, VirtualFile};
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
@@ -71,7 +72,10 @@ use utils::{
lsn::Lsn,
};
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
use super::{
AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
ValuesReconstructState,
};
///
/// Header stored in the beginning of the file
@@ -196,6 +200,7 @@ impl DeltaKey {
pub struct DeltaLayer {
path: Utf8PathBuf,
pub desc: PersistentLayerDesc,
access_stats: LayerAccessStats,
inner: OnceCell<Arc<DeltaLayerInner>>,
}
@@ -294,6 +299,7 @@ impl DeltaLayer {
/// not loaded already.
///
async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
self.access_stats.record_access(ctx);
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner(ctx))
@@ -344,6 +350,7 @@ impl DeltaLayer {
summary.lsn_range,
metadata.len(),
),
access_stats: Default::default(),
inner: OnceCell::new(),
})
}
@@ -366,6 +373,7 @@ impl DeltaLayer {
/// 3. Call `finish`.
///
struct DeltaLayerWriterInner {
conf: &'static PageServerConf,
pub path: Utf8PathBuf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
@@ -376,9 +384,6 @@ struct DeltaLayerWriterInner {
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
blob_writer: BlobWriter<true>,
// Number of key-lsns in the layer.
num_keys: usize,
}
impl DeltaLayerWriterInner {
@@ -412,6 +417,7 @@ impl DeltaLayerWriterInner {
let tree_builder = DiskBtreeBuilder::new(block_buf);
Ok(Self {
conf,
path,
timeline_id,
tenant_shard_id,
@@ -419,7 +425,6 @@ impl DeltaLayerWriterInner {
lsn_range,
tree: tree_builder,
blob_writer,
num_keys: 0,
})
}
@@ -470,9 +475,6 @@ impl DeltaLayerWriterInner {
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
let res = self.tree.append(&delta_key.0, blob_ref.0);
self.num_keys += 1;
(val, res.map_err(|e| anyhow::anyhow!(e)))
}
@@ -486,10 +488,11 @@ impl DeltaLayerWriterInner {
async fn finish(
self,
key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
) -> anyhow::Result<ResidentLayer> {
let temp_path = self.path.clone();
let result = self.finish0(key_end, ctx).await;
let result = self.finish0(key_end, timeline, ctx).await;
if result.is_err() {
tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -502,8 +505,9 @@ impl DeltaLayerWriterInner {
async fn finish0(
self,
key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
) -> anyhow::Result<ResidentLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -568,9 +572,11 @@ impl DeltaLayerWriterInner {
// fsync the file
file.sync_all().await?;
trace!("created delta layer {}", self.path);
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
Ok((desc, self.path))
trace!("created delta layer {}", layer.local_path());
Ok(layer)
}
}
@@ -671,20 +677,14 @@ impl DeltaLayerWriter {
pub(crate) async fn finish(
mut self,
key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
self.inner.take().unwrap().finish(key_end, ctx).await
}
#[cfg(test)]
pub(crate) fn num_keys(&self) -> usize {
self.inner.as_ref().unwrap().num_keys
}
#[cfg(test)]
pub(crate) fn estimated_size(&self) -> u64 {
let inner = self.inner.as_ref().unwrap();
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
) -> anyhow::Result<ResidentLayer> {
self.inner
.take()
.unwrap()
.finish(key_end, timeline, ctx)
.await
}
}
@@ -808,6 +808,95 @@ impl DeltaLayerInner {
})
}
pub(super) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let mut need_image = true;
// Scan the page versions backwards, starting from `lsn`.
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
&block_reader,
);
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
tree_reader
.visit(
&search_key.0,
VisitDirection::Backwards,
|key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
!blob_ref.will_init()
},
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
.build(),
)
.await?;
let ctx = &RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerValue)
.build();
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let cursor = block_reader.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
cursor
.read_blob_into_buf(pos, &mut buf, ctx)
.await
.with_context(|| {
format!("Failed to read blob from virtual file {}", self.file.path)
})?;
let val = Value::des(&buf).with_context(|| {
format!(
"Failed to deserialize file blob from virtual file {}",
self.file.path
)
})?;
match val {
Value::Image(img) => {
reconstruct_state.img = Some((entry_lsn, img));
need_image = false;
break;
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(ValueReconstructResult::Continue)
} else {
Ok(ValueReconstructResult::Complete)
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
//
@@ -1580,9 +1669,8 @@ pub(crate) mod test {
use super::*;
use crate::repository::Value;
use crate::tenant::harness::TIMELINE_ID;
use crate::tenant::storage_layer::{Layer, ResidentLayer};
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
use crate::tenant::{Tenant, Timeline};
use crate::tenant::Tenant;
use crate::{
context::DownloadBehavior,
task_mgr::TaskKind,
@@ -1876,8 +1964,9 @@ pub(crate) mod test {
res?;
}
let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?;
let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?;
let resident = writer
.finish(entries_meta.key_range.end, &timeline, &ctx)
.await?;
let inner = resident.get_as_delta(&ctx).await?;
@@ -1957,7 +2046,6 @@ pub(crate) mod test {
.await
.likely_resident_layers()
.next()
.cloned()
.unwrap();
{
@@ -2032,8 +2120,7 @@ pub(crate) mod test {
.read()
.await
.likely_resident_layers()
.find(|&x| x != &initdb_layer)
.cloned()
.find(|x| x != &initdb_layer)
.unwrap();
// create a copy for the timeline, so we don't overwrite the file
@@ -2068,8 +2155,7 @@ pub(crate) mod test {
.await
.unwrap();
let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap();
let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();
let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
copied_layer.get_as_delta(ctx).await.unwrap();
@@ -2197,9 +2283,7 @@ pub(crate) mod test {
for (key, lsn, value) in deltas {
writer.put_value(key, lsn, value, ctx).await?;
}
let (desc, path) = writer.finish(key_end, ctx).await?;
let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
let delta_layer = writer.finish(key_end, tline, ctx).await?;
Ok::<_, anyhow::Error>(delta_layer)
}

View File

@@ -32,6 +32,9 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -134,6 +137,7 @@ pub struct ImageLayer {
pub desc: PersistentLayerDesc,
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
pub lsn: Lsn,
access_stats: LayerAccessStats,
inner: OnceCell<ImageLayerInner>,
}
@@ -251,6 +255,7 @@ impl ImageLayer {
/// not loaded already.
///
async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
self.access_stats.record_access(ctx);
self.inner
.get_or_try_init(|| self.load_inner(ctx))
.await
@@ -301,6 +306,7 @@ impl ImageLayer {
metadata.len(),
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
access_stats: Default::default(),
inner: OnceCell::new(),
})
}
@@ -423,6 +429,46 @@ impl ImageLayerInner {
})
}
pub(super) async fn get_value_reconstruct_data(
&self,
key: Key,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader
.get(
&keybuf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
.build(),
)
.await?
{
let blob = block_reader
.block_cursor()
.read_blob(
offset,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerValue)
.build(),
)
.await
.with_context(|| format!("failed to read value from offset {}", offset))?;
let value = Bytes::from(blob);
reconstruct_state.img = Some((self.lsn, value));
Ok(ValueReconstructResult::Complete)
} else {
Ok(ValueReconstructResult::Missing)
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
pub(super) async fn get_values_reconstruct_data(
@@ -696,21 +742,11 @@ struct ImageLayerWriterInner {
// where we have chosen their compressed form
uncompressed_bytes_chosen: u64,
// Number of keys in the layer.
num_keys: usize,
blob_writer: BlobWriter<false>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
#[cfg_attr(not(feature = "testing"), allow(dead_code))]
last_written_key: Key,
}
impl ImageLayerWriterInner {
fn size(&self) -> u64 {
self.tree.borrow_writer().size() + self.blob_writer.size()
}
///
/// Start building a new image layer.
///
@@ -764,8 +800,6 @@ impl ImageLayerWriterInner {
uncompressed_bytes: 0,
uncompressed_bytes_eligible: 0,
uncompressed_bytes_chosen: 0,
num_keys: 0,
last_written_key: Key::MIN,
};
Ok(writer)
@@ -786,7 +820,6 @@ impl ImageLayerWriterInner {
let compression = self.conf.image_compression;
let uncompressed_len = img.len() as u64;
self.uncompressed_bytes += uncompressed_len;
self.num_keys += 1;
let (_img, res) = self
.blob_writer
.write_blob_maybe_compressed(img, ctx, compression)
@@ -806,11 +839,6 @@ impl ImageLayerWriterInner {
key.write_to_byte_slice(&mut keybuf);
self.tree.append(&keybuf, off)?;
#[cfg(feature = "testing")]
{
self.last_written_key = key;
}
Ok(())
}
@@ -821,7 +849,6 @@ impl ImageLayerWriterInner {
self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Option<Key>,
) -> anyhow::Result<ResidentLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -872,23 +899,11 @@ impl ImageLayerWriterInner {
let desc = PersistentLayerDesc::new_img(
self.tenant_shard_id,
self.timeline_id,
if let Some(end_key) = end_key {
self.key_range.start..end_key
} else {
self.key_range.clone()
},
self.key_range.clone(),
self.lsn,
metadata.len(),
);
#[cfg(feature = "testing")]
if let Some(end_key) = end_key {
assert!(
self.last_written_key < end_key,
"written key violates end_key range"
);
}
// Note: Because we open the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.file here. The first read will have to re-open it.
@@ -965,18 +980,6 @@ impl ImageLayerWriter {
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
}
#[cfg(test)]
/// Estimated size of the image layer.
pub(crate) fn estimated_size(&self) -> u64 {
let inner = self.inner.as_ref().unwrap();
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
}
#[cfg(test)]
pub(crate) fn num_keys(&self) -> usize {
self.inner.as_ref().unwrap().num_keys
}
///
/// Finish writing the image layer.
///
@@ -985,26 +988,7 @@ impl ImageLayerWriter {
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<super::ResidentLayer> {
self.inner.take().unwrap().finish(timeline, ctx, None).await
}
#[cfg(test)]
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
pub(super) async fn finish_with_end_key(
mut self,
timeline: &Arc<Timeline>,
end_key: Key,
ctx: &RequestContext,
) -> anyhow::Result<super::ResidentLayer> {
self.inner
.take()
.unwrap()
.finish(timeline, ctx, Some(end_key))
.await
}
pub(crate) fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
self.inner.take().unwrap().finish(timeline, ctx).await
}
}

View File

@@ -10,11 +10,11 @@ use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value};
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::{l0_flush, page_cache, walrecord};
use anyhow::{anyhow, Result};
use camino::Utf8PathBuf;
use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
@@ -34,7 +34,8 @@ use std::sync::atomic::{AtomicU64, AtomicUsize};
use tokio::sync::{RwLock, RwLockWriteGuard};
use super::{
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
ValuesReconstructState,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
@@ -54,6 +55,9 @@ pub struct InMemoryLayer {
/// Writes are only allowed when this is `None`.
pub(crate) end_lsn: OnceLock<Lsn>,
/// Used for the traversal path. Cached representation of the in-memory layer before it is frozen.
local_path_str: Arc<str>,
/// Used for the traversal path. Cached representation of the in-memory layer after it is frozen.
frozen_local_path_str: OnceLock<Arc<str>>,
@@ -244,6 +248,12 @@ impl InMemoryLayer {
self.start_lsn..self.end_lsn_or_max()
}
pub(crate) fn local_path_str(&self) -> &Arc<str> {
self.frozen_local_path_str
.get()
.unwrap_or(&self.local_path_str)
}
/// debugging function to print out the contents of the layer
///
/// this is likely completely unused
@@ -293,6 +303,60 @@ impl InMemoryLayer {
Ok(())
}
/// Look up given value in the layer.
pub(crate) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.start_lsn);
let mut need_image = true;
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
let inner = self.inner.read().await;
let reader = inner.file.block_cursor();
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
let buf = reader.read_blob(*pos, &ctx).await?;
let value = Value::des(&buf)?;
match value {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
return Ok(ValueReconstructResult::Complete);
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((*entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
}
// release lock on 'inner'
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(ValueReconstructResult::Continue)
} else {
Ok(ValueReconstructResult::Complete)
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
//
@@ -385,17 +449,20 @@ impl InMemoryLayer {
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
start_lsn: Lsn,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
) -> Result<InMemoryLayer> {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file =
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
let key = InMemoryLayerFileId(file.page_cache_file_id());
Ok(InMemoryLayer {
file_id: key,
local_path_str: {
let mut buf = String::new();
inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
buf.into()
},
frozen_local_path_str: OnceLock::new(),
conf,
timeline_id,
@@ -415,7 +482,8 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub async fn put_value(
pub(crate) async fn put_value(
&self,
key: Key,
lsn: Lsn,
@@ -480,6 +548,8 @@ impl InMemoryLayer {
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is exclusive
pub async fn freeze(&self, end_lsn: Lsn) {
let inner = self.inner.write().await;
assert!(
self.start_lsn < end_lsn,
"{} >= {}",
@@ -497,13 +567,9 @@ impl InMemoryLayer {
})
.expect("frozen_local_path_str set only once");
#[cfg(debug_assertions)]
{
let inner = self.inner.write().await;
for vec_map in inner.index.values() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
for vec_map in inner.index.values() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
}
}
@@ -513,12 +579,12 @@ impl InMemoryLayer {
/// if there are no matching keys.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub async fn write_to_disk(
pub(crate) async fn write_to_disk(
&self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
key_range: Option<Range<Key>>,
l0_flush_global_state: &l0_flush::Inner,
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
) -> Result<Option<ResidentLayer>> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -530,8 +596,9 @@ impl InMemoryLayer {
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().await;
let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
use l0_flush::Inner;
let _concurrency_permit = match l0_flush_global_state {
let _concurrency_permit = match &*l0_flush_global_state {
Inner::PageCached => None,
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
};
@@ -561,7 +628,7 @@ impl InMemoryLayer {
)
.await?;
match l0_flush_global_state {
match &*l0_flush_global_state {
l0_flush::Inner::PageCached => {
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
@@ -626,7 +693,7 @@ impl InMemoryLayer {
}
// MAX is used here because we identify L0 layers by full key range
let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
// Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
//
@@ -638,6 +705,6 @@ impl InMemoryLayer {
// we dirtied when writing to the filesystem have been flushed and marked !dirty.
drop(_concurrency_permit);
Ok(Some((desc, path)))
Ok(Some(delta_layer))
}
}
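A note on the concurrency-permit handling in write_to_disk above: when the l0_flush mode buffers whole files in memory ("Direct"), a semaphore bounds how many flushes may hold such buffers at once, and the permit is held until the data, including the final fsync, has reached the filesystem. A hedged sketch of that shape; the enum, function, and path below are stand-ins, not the pageserver's l0_flush module.

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

enum L0FlushMode {
    PageCached,
    Direct { semaphore: Arc<Semaphore> },
}

async fn flush_one(mode: &L0FlushMode, payload: Vec<u8>) -> std::io::Result<()> {
    // Acquire the permit (if any) before buffering, and hold it across the write.
    let _permit = match mode {
        L0FlushMode::PageCached => None,
        L0FlushMode::Direct { semaphore } => Some(semaphore.acquire().await.unwrap()),
    };

    // Stand-in for writing the delta layer file and syncing it to disk.
    tokio::fs::write("/tmp/example-layer", &payload).await?;

    // `_permit` is dropped here, only after the IO has completed.
    Ok(())
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let mode = L0FlushMode::Direct { semaphore: Arc::new(Semaphore::new(2)) };
    flush_one(&mode, vec![0u8; 1024]).await
}
```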

View File

@@ -24,7 +24,8 @@ use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self};
use super::{
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
ValuesReconstructState,
};
use utils::generation::Generation;
@@ -300,6 +301,42 @@ impl Layer {
self.0.delete_on_drop();
}
/// Return data needed to reconstruct given page at LSN.
///
/// It is up to the caller to collect more data from the previous layer and
/// perform WAL redo, if necessary.
///
/// # Cancellation-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
use anyhow::ensure;
let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
self.0.access_stats.record_access(ctx);
if self.layer_desc().is_delta {
ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
ensure!(self.layer_desc().key_range.contains(&key));
} else {
ensure!(self.layer_desc().key_range.contains(&key));
ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
}
layer
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
.instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
.await
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
}
pub(crate) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
@@ -316,7 +353,7 @@ impl Layer {
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
self.record_access(ctx);
self.0.access_stats.record_access(ctx);
layer
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -396,18 +433,18 @@ impl Layer {
self.0.info(reset)
}
pub(crate) fn latest_activity(&self) -> SystemTime {
self.0.access_stats.latest_activity()
}
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
self.0.access_stats.visibility()
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
&self.0.access_stats
}
pub(crate) fn local_path(&self) -> &Utf8Path {
&self.0.path
}
pub(crate) fn debug_str(&self) -> &Arc<str> {
&self.0.debug_str
}
pub(crate) fn metadata(&self) -> LayerFileMetadata {
self.0.metadata()
}
@@ -451,31 +488,13 @@ impl Layer {
}
}
fn record_access(&self, ctx: &RequestContext) {
if self.0.access_stats.record_access(ctx) {
// Visibility was modified to Visible
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge
.add(self.0.desc.file_size)
}
}
}
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
let old_visibility = self.access_stats().set_visibility(visibility.clone());
use LayerVisibilityHint::*;
match (old_visibility, visibility) {
(Visible, Covered) => {
// Subtract this layer's contribution to the visible size metric
if let Some(tl) = self.0.timeline.upgrade() {
debug_assert!(
tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
);
tl.metrics
.visible_physical_size_gauge
.sub(self.0.desc.file_size)
@@ -500,7 +519,7 @@ impl Layer {
///
/// However when we want something evicted, we cannot evict it right away as there might be current
/// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
/// read with [`Layer::get_values_reconstruct_data`].
/// read with [`Layer::get_value_reconstruct_data`].
///
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
#[derive(Debug)]
@@ -581,6 +600,9 @@ struct LayerInner {
/// Full path to the file; unclear if this should exist anymore.
path: Utf8PathBuf,
/// String representation of the layer, used for traversal id.
debug_str: Arc<str>,
desc: PersistentLayerDesc,
/// Timeline access is needed for remote timeline client and metrics.
@@ -693,9 +715,6 @@ impl Drop for LayerInner {
}
if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
debug_assert!(
timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
);
timeline
.metrics
.visible_physical_size_gauge
@@ -817,6 +836,9 @@ impl LayerInner {
LayerInner {
conf,
debug_str: {
format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
},
path: local_path,
desc,
timeline: Arc::downgrade(timeline),
@@ -1737,6 +1759,28 @@ impl DownloadedLayer {
.map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
}
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
owner: &Arc<LayerInner>,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
use LayerKind::*;
match self.get(owner, ctx).await? {
Delta(d) => {
d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
.await
}
Image(i) => {
i.get_value_reconstruct_data(key, reconstruct_data, ctx)
.await
}
}
}
async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
@@ -1835,7 +1879,7 @@ impl ResidentLayer {
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
// while it's being held.
self.owner.record_access(ctx);
owner.access_stats.record_access(ctx);
delta_layer::DeltaLayerInner::load_keys(d, ctx)
.await

View File

@@ -39,7 +39,7 @@ async fn smoke_test() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -50,26 +50,13 @@ async fn smoke_test() {
// all layers created at pageserver are like `layer`, initialized with strong
// Arc<DownloadedLayer>.
let controlfile_keyspace = KeySpace {
ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
};
let img_before = {
let mut data = ValuesReconstructState::default();
let mut data = ValueReconstructState::default();
layer
.get_values_reconstruct_data(
controlfile_keyspace.clone(),
Lsn(0x10)..Lsn(0x11),
&mut data,
&ctx,
)
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.await
.unwrap();
data.keys
.remove(&CONTROLFILE_KEY)
.expect("must be present")
.expect("should not error")
.img
data.img
.take()
.expect("tenant harness writes the control file")
};
@@ -87,24 +74,13 @@ async fn smoke_test() {
// on accesses when the layer is evicted, it will automatically be downloaded.
let img_after = {
let mut data = ValuesReconstructState::default();
let mut data = ValueReconstructState::default();
layer
.get_values_reconstruct_data(
controlfile_keyspace.clone(),
Lsn(0x10)..Lsn(0x11),
&mut data,
&ctx,
)
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.instrument(download_span.clone())
.await
.unwrap();
data.keys
.remove(&CONTROLFILE_KEY)
.expect("must be present")
.expect("should not error")
.img
.take()
.expect("tenant harness writes the control file")
data.img.take().unwrap()
};
assert_eq!(img_before, img_after);
@@ -176,7 +152,7 @@ async fn smoke_test() {
{
let layers = &[layer];
let mut g = timeline.layers.write().await;
g.open_mut().unwrap().finish_gc_timeline(layers);
g.finish_gc_timeline(layers);
// this just updates the remote_physical_size for demonstration purposes
rtc.schedule_gc_update(layers).unwrap();
}
@@ -216,7 +192,7 @@ async fn evict_and_wait_on_wanted_deleted() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -260,7 +236,7 @@ async fn evict_and_wait_on_wanted_deleted() {
// the deletion of the layer in remote_storage happens.
{
let mut layers = timeline.layers.write().await;
layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
layers.finish_gc_timeline(&[layer]);
}
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
@@ -301,7 +277,7 @@ fn read_wins_pending_eviction() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -433,7 +409,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -602,7 +578,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -682,7 +658,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -801,9 +777,9 @@ async fn eviction_cancellation_on_drop() {
let (evicted_layer, not_evicted) = {
let mut layers = {
let mut guard = timeline.layers.write().await;
let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
let layers = guard.likely_resident_layers().collect::<Vec<_>>();
// remove the layers from layermap
guard.open_mut().unwrap().finish_gc_timeline(&layers);
guard.finish_gc_timeline(&layers);
layers
};
@@ -854,7 +830,7 @@ async fn eviction_cancellation_on_drop() {
fn layer_size() {
assert_eq!(size_of::<LayerAccessStats>(), 8);
assert_eq!(size_of::<PersistentLayerDesc>(), 104);
assert_eq!(size_of::<LayerInner>(), 296);
assert_eq!(size_of::<LayerInner>(), 312);
// it also has the utf8 path
}

View File

@@ -1,454 +0,0 @@
use std::{ops::Range, sync::Arc};
use bytes::Bytes;
use pageserver_api::key::{Key, KEY_SIZE};
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
use crate::tenant::storage_layer::Layer;
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
/// An image writer that takes images and produces multiple image layers. The interface does not
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
/// to be cleaned up)
#[must_use]
pub struct SplitImageLayerWriter {
inner: ImageLayerWriter,
target_layer_size: u64,
generated_layers: Vec<ResidentLayer>,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
lsn: Lsn,
}
impl SplitImageLayerWriter {
pub async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
start_key: Key,
lsn: Lsn,
target_layer_size: u64,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
Ok(Self {
target_layer_size,
inner: ImageLayerWriter::new(
conf,
timeline_id,
tenant_shard_id,
&(start_key..Key::MAX),
lsn,
ctx,
)
.await?,
generated_layers: Vec::new(),
conf,
timeline_id,
tenant_shard_id,
lsn,
})
}
pub async fn put_image(
&mut self,
key: Key,
img: Bytes,
tline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// The current estimation is an upper bound of the space that the key/image could take
// because we did not consider compression in this estimation. The resulting image layer
// could be smaller than the target size.
let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
let next_image_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(key..Key::MAX),
self.lsn,
ctx,
)
.await?;
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
self.generated_layers.push(
prev_image_writer
.finish_with_end_key(tline, key, ctx)
.await?,
);
}
self.inner.put_image(key, img, ctx).await
}
pub(crate) async fn finish(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<ResidentLayer>> {
let Self {
mut generated_layers,
inner,
..
} = self;
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
Ok(generated_layers)
}
/// When split writer fails, the caller should call this function and handle partially generated layers.
#[allow(dead_code)]
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
Ok((self.generated_layers, self.inner))
}
}
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
/// to be cleaned up).
#[must_use]
pub struct SplitDeltaLayerWriter {
inner: DeltaLayerWriter,
target_layer_size: u64,
generated_layers: Vec<ResidentLayer>,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
lsn_range: Range<Lsn>,
}
impl SplitDeltaLayerWriter {
pub async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
start_key: Key,
lsn_range: Range<Lsn>,
target_layer_size: u64,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
Ok(Self {
target_layer_size,
inner: DeltaLayerWriter::new(
conf,
timeline_id,
tenant_shard_id,
start_key,
lsn_range.clone(),
ctx,
)
.await?,
generated_layers: Vec::new(),
conf,
timeline_id,
tenant_shard_id,
lsn_range,
})
}
pub async fn put_value(
&mut self,
key: Key,
lsn: Lsn,
val: Value,
tline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
let next_delta_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
key,
self.lsn_range.clone(),
ctx,
)
.await?;
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
self.generated_layers.push(delta_layer);
}
self.inner.put_value(key, lsn, val, ctx).await
}
pub(crate) async fn finish(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<ResidentLayer>> {
let Self {
mut generated_layers,
inner,
..
} = self;
let (desc, path) = inner.finish(end_key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
generated_layers.push(delta_layer);
Ok(generated_layers)
}
/// When split writer fails, the caller should call this function and handle partially generated layers.
#[allow(dead_code)]
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
Ok((self.generated_layers, self.inner))
}
}
#[cfg(test)]
mod tests {
use crate::{
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::AsLayerDesc,
},
DEFAULT_PG_VERSION,
};
use super::*;
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
fn get_img(id: u32) -> Bytes {
format!("{id:064}").into()
}
fn get_large_img() -> Bytes {
vec![0; 8192].into()
}
#[tokio::test]
async fn write_one_image() {
let harness = TenantHarness::create("split_writer_write_one_image")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
let mut image_writer = SplitImageLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18)..Lsn(0x20),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
image_writer
.put_image(get_key(0), get_img(0), &tline, &ctx)
.await
.unwrap();
let layers = image_writer
.finish(&tline, &ctx, get_key(10))
.await
.unwrap();
assert_eq!(layers.len(), 1);
delta_writer
.put_value(
get_key(0),
Lsn(0x18),
Value::Image(get_img(0)),
&tline,
&ctx,
)
.await
.unwrap();
let layers = delta_writer
.finish(&tline, &ctx, get_key(10))
.await
.unwrap();
assert_eq!(layers.len(), 1);
}
#[tokio::test]
async fn write_split() {
let harness = TenantHarness::create("split_writer_write_split")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
let mut image_writer = SplitImageLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18)..Lsn(0x20),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
const N: usize = 2000;
for i in 0..N {
let i = i as u32;
image_writer
.put_image(get_key(i), get_large_img(), &tline, &ctx)
.await
.unwrap();
delta_writer
.put_value(
get_key(i),
Lsn(0x20),
Value::Image(get_large_img()),
&tline,
&ctx,
)
.await
.unwrap();
}
let image_layers = image_writer
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
let delta_layers = delta_writer
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
assert_eq!(image_layers.len(), N / 512 + 1);
assert_eq!(delta_layers.len(), N / 512 + 1);
for idx in 0..image_layers.len() {
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
if idx > 0 {
assert_eq!(
image_layers[idx - 1].layer_desc().key_range.end,
image_layers[idx].layer_desc().key_range.start
);
assert_eq!(
delta_layers[idx - 1].layer_desc().key_range.end,
delta_layers[idx].layer_desc().key_range.start
);
}
}
}
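The layer counts asserted above follow from a back-of-the-envelope estimate; the sketch below (with a hypothetical `expected_layer_count` helper, not part of this patch) shows only the dominant term, since the real writers also count KEY_SIZE and header overhead when deciding where to split.

fn expected_layer_count(num_keys: u64, value_size: u64, target_layer_size: u64) -> u64 {
    // Roughly how many values of `value_size` fit under the target before the
    // split writer rolls over to a new layer; per-key overhead is ignored.
    let keys_per_layer = target_layer_size / value_size;
    num_keys / keys_per_layer + 1
}

#[test]
fn split_estimate_matches_assertions() {
    // write_split: 2000 keys of ~8 KiB each against a 4 MiB target gives
    // 512 keys per layer, hence 2000 / 512 + 1 = 4 layers for both writers.
    assert_eq!(expected_layer_count(2000, 8192, 4 * 1024 * 1024), 4);
}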
#[tokio::test]
async fn write_large_img() {
let harness = TenantHarness::create("split_writer_write_large_img")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
let mut image_writer = SplitImageLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18),
4 * 1024,
&ctx,
)
.await
.unwrap();
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18)..Lsn(0x20),
4 * 1024,
&ctx,
)
.await
.unwrap();
image_writer
.put_image(get_key(0), get_img(0), &tline, &ctx)
.await
.unwrap();
image_writer
.put_image(get_key(1), get_large_img(), &tline, &ctx)
.await
.unwrap();
let layers = image_writer
.finish(&tline, &ctx, get_key(10))
.await
.unwrap();
assert_eq!(layers.len(), 2);
delta_writer
.put_value(
get_key(0),
Lsn(0x18),
Value::Image(get_img(0)),
&tline,
&ctx,
)
.await
.unwrap();
delta_writer
.put_value(
get_key(1),
Lsn(0x1A),
Value::Image(get_large_img()),
&tline,
&ctx,
)
.await
.unwrap();
let layers = delta_writer
.finish(&tline, &ctx, get_key(10))
.await
.unwrap();
assert_eq!(layers.len(), 2);
}
}

View File

@@ -407,16 +407,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
if matches!(e, crate::tenant::GcError::TimelineCancelled) {
// Timeline was cancelled during gc. We might either be in an event
// that affects the entire tenant (tenant deletion, pageserver shutdown),
// or in one that affects the timeline only (timeline deletion).
// Therefore, don't exit the loop.
info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
} else {
error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
}
error!(
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
wait_duration
}
}

File diff suppressed because it is too large

View File

@@ -19,10 +19,8 @@ use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::key::KEY_SIZE;
use pageserver_api::keyspace::ShardedRange;
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use serde::Serialize;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
@@ -43,7 +41,6 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use crate::keyspace::KeySpace;
use crate::repository::{Key, Value};
use crate::walrecord::NeonWalRecord;
use utils::lsn::Lsn;
@@ -76,7 +73,6 @@ impl KeyHistoryRetention {
key: Key,
delta_writer: &mut Vec<(Key, Lsn, Value)>,
mut image_writer: Option<&mut ImageLayerWriter>,
stat: &mut CompactionStatistics,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut first_batch = true;
@@ -86,7 +82,6 @@ impl KeyHistoryRetention {
let Value::Image(img) = &logs[0].1 else {
unreachable!()
};
stat.produce_image_key(img);
if let Some(image_writer) = image_writer.as_mut() {
image_writer.put_image(key, img.clone(), ctx).await?;
} else {
@@ -94,111 +89,24 @@ impl KeyHistoryRetention {
}
} else {
for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
}
}
first_batch = false;
} else {
for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
}
}
}
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
for (lsn, val) in above_horizon_logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
}
Ok(())
}
}
#[derive(Debug, Serialize, Default)]
struct CompactionStatisticsNumSize {
num: u64,
size: u64,
}
#[derive(Debug, Serialize, Default)]
pub struct CompactionStatistics {
delta_layer_visited: CompactionStatisticsNumSize,
image_layer_visited: CompactionStatisticsNumSize,
delta_layer_produced: CompactionStatisticsNumSize,
image_layer_produced: CompactionStatisticsNumSize,
num_delta_layer_discarded: usize,
num_image_layer_discarded: usize,
num_unique_keys_visited: usize,
wal_keys_visited: CompactionStatisticsNumSize,
image_keys_visited: CompactionStatisticsNumSize,
wal_produced: CompactionStatisticsNumSize,
image_produced: CompactionStatisticsNumSize,
}
impl CompactionStatistics {
fn estimated_size_of_value(val: &Value) -> usize {
match val {
Value::Image(img) => img.len(),
Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
_ => std::mem::size_of::<NeonWalRecord>(),
}
}
fn estimated_size_of_key() -> usize {
KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
}
fn visit_delta_layer(&mut self, size: u64) {
self.delta_layer_visited.num += 1;
self.delta_layer_visited.size += size;
}
fn visit_image_layer(&mut self, size: u64) {
self.image_layer_visited.num += 1;
self.image_layer_visited.size += size;
}
fn on_unique_key_visited(&mut self) {
self.num_unique_keys_visited += 1;
}
fn visit_wal_key(&mut self, val: &Value) {
self.wal_keys_visited.num += 1;
self.wal_keys_visited.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn visit_image_key(&mut self, val: &Value) {
self.image_keys_visited.num += 1;
self.image_keys_visited.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn produce_key(&mut self, val: &Value) {
match val {
Value::Image(img) => self.produce_image_key(img),
Value::WalRecord(_) => self.produce_wal_key(val),
}
}
fn produce_wal_key(&mut self, val: &Value) {
self.wal_produced.num += 1;
self.wal_produced.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn produce_image_key(&mut self, val: &Bytes) {
self.image_produced.num += 1;
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
}
fn discard_delta_layer(&mut self) {
self.num_delta_layer_discarded += 1;
}
fn discard_image_layer(&mut self) {
self.num_image_layer_discarded += 1;
}
fn produce_delta_layer(&mut self, size: u64) {
self.delta_layer_produced.num += 1;
self.delta_layer_produced.size += size;
}
fn produce_image_layer(&mut self, size: u64) {
self.image_layer_produced.num += 1;
self.image_layer_produced.size += size;
}
}
impl Timeline {
/// TODO: cancellation
///
@@ -210,18 +118,12 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<bool, CompactionError> {
if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
self.compact_with_gc(cancel, flags, ctx)
self.compact_with_gc(cancel, ctx)
.await
.map_err(CompactionError::Other)?;
return Ok(false);
}
if flags.contains(CompactFlags::DryRun) {
return Err(CompactionError::Other(anyhow!(
"dry-run mode is not supported for legacy compaction for now"
)));
}
// High level strategy for compaction / image creation:
//
// 1. First, calculate the desired "partitioning" of the
@@ -371,7 +273,7 @@ impl Timeline {
);
let layers = self.layers.read().await;
for layer_desc in layers.layer_map()?.iter_historic_layers() {
for layer_desc in layers.layer_map().iter_historic_layers() {
let layer = layers.get_from_desc(&layer_desc);
if layer.metadata().shard.shard_count == self.shard_identity.count {
// This layer does not belong to a historic ancestor, no need to re-image it.
@@ -549,9 +451,7 @@ impl Timeline {
///
/// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
/// that we know won't be needed for reads.
pub(super) async fn update_layer_visibility(
&self,
) -> Result<(), super::layer_manager::Shutdown> {
pub(super) async fn update_layer_visibility(&self) {
let head_lsn = self.get_last_record_lsn();
// We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas
@@ -559,7 +459,7 @@ impl Timeline {
// Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
// they will be subject to L0->L1 compaction in the near future.
let layer_manager = self.layers.read().await;
let layer_map = layer_manager.layer_map()?;
let layer_map = layer_manager.layer_map();
let readable_points = {
let children = self.gc_info.read().unwrap().retain_lsns.clone();
@@ -582,7 +482,6 @@ impl Timeline {
// TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
// avoid assuming that everything at a branch point is visible.
drop(covered);
Ok(())
}
/// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
@@ -636,8 +535,12 @@ impl Timeline {
) -> Result<CompactLevel0Phase1Result, CompactionError> {
stats.read_lock_held_spawn_blocking_startup_micros =
stats.read_lock_acquisition_micros.till_now(); // set by caller
let layers = guard.layer_map()?;
let level0_deltas = layers.level0_deltas();
let layers = guard.layer_map();
let level0_deltas = layers.get_level0_deltas();
let mut level0_deltas = level0_deltas
.into_iter()
.map(|x| guard.get_from_desc(&x))
.collect_vec();
stats.level0_deltas_count = Some(level0_deltas.len());
// Only compact if enough layers have accumulated.
@@ -650,11 +553,6 @@ impl Timeline {
return Ok(CompactLevel0Phase1Result::default());
}
let mut level0_deltas = level0_deltas
.iter()
.map(|x| guard.get_from_desc(x))
.collect::<Vec<_>>();
// Gather the files to compact in this iteration.
//
// Start with the oldest Level 0 delta file, and collect any other
@@ -1108,16 +1006,14 @@ impl Timeline {
|| contains_hole
{
// ... if so, flush previous layer and prepare to write new one
let (desc, path) = writer
.take()
.unwrap()
.finish(prev_key.unwrap().next(), ctx)
.await
.map_err(CompactionError::Other)?;
let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
.map_err(CompactionError::Other)?;
new_layers.push(new_delta);
new_layers.push(
writer
.take()
.unwrap()
.finish(prev_key.unwrap().next(), self, ctx)
.await
.map_err(CompactionError::Other)?,
);
writer = None;
if contains_hole {
@@ -1180,13 +1076,12 @@ impl Timeline {
prev_key = Some(key);
}
if let Some(writer) = writer {
let (desc, path) = writer
.finish(prev_key.unwrap().next(), ctx)
.await
.map_err(CompactionError::Other)?;
let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
.map_err(CompactionError::Other)?;
new_layers.push(new_delta);
new_layers.push(
writer
.finish(prev_key.unwrap().next(), self, ctx)
.await
.map_err(CompactionError::Other)?,
);
}
// Sync layers
@@ -1411,9 +1306,10 @@ impl Timeline {
// Find the top of the historical layers
let end_lsn = {
let guard = self.layers.read().await;
let layers = guard.layer_map()?;
let layers = guard.layer_map();
let l0_deltas = layers.level0_deltas();
let l0_deltas = layers.get_level0_deltas();
drop(guard);
// As an optimization, if we find that there are too few L0 layers,
// bail out early. We know that the compaction algorithm would do
@@ -1745,47 +1641,38 @@ impl Timeline {
pub(crate) async fn compact_with_gc(
self: &Arc<Self>,
cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use std::collections::BTreeSet;
// Block other compaction/GC tasks from running for now. GC-compaction could run along
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
// with legacy compaction tasks in the future.
let gc_lock = async {
tokio::select! {
guard = self.gc_lock.lock() => Ok(guard),
// TODO: refactor to CompactionError to correctly pass cancelled error
_ = cancel.cancelled() => Err(anyhow!("cancelled")),
}
let _compaction_lock = tokio::select! {
guard = self.compaction_lock.lock() => guard,
// TODO: refactor to CompactionError to correctly pass cancelled error
_ = cancel.cancelled() => return Err(anyhow!("cancelled")),
};
let gc_lock = crate::timed(
gc_lock,
"acquires gc lock",
std::time::Duration::from_secs(5),
)
.await?;
let _gc = tokio::select! {
guard = self.gc_lock.lock() => guard,
// TODO: refactor to CompactionError to correctly pass cancelled error
_ = cancel.cancelled() => return Err(anyhow!("cancelled")),
};
let dry_run = flags.contains(CompactFlags::DryRun);
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
info!("running enhanced gc bottom-most compaction");
scopeguard::defer! {
info!("done enhanced gc bottom-most compaction");
};
let mut stat = CompactionStatistics::default();
// Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
// The layer selection has the following properties:
// 1. If a layer is in the selection, all layers below it are in the selection.
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
let guard = self.layers.read().await;
let layers = guard.layer_map()?;
let layers = guard.layer_map();
let gc_info = self.gc_info.read().unwrap();
let mut retain_lsns_below_horizon = Vec::new();
let gc_cutoff = gc_info.cutoffs.select_min();
@@ -1849,9 +1736,6 @@ impl Timeline {
let key_range = desc.get_key_range();
delta_split_points.insert(key_range.start);
delta_split_points.insert(key_range.end);
stat.visit_delta_layer(desc.file_size());
} else {
stat.visit_image_layer(desc.file_size());
}
}
let mut delta_layers = Vec::new();
@@ -1887,8 +1771,6 @@ impl Timeline {
tline: &Arc<Timeline>,
lowest_retain_lsn: Lsn,
ctx: &RequestContext,
stats: &mut CompactionStatistics,
dry_run: bool,
last_batch: bool,
) -> anyhow::Result<Option<FlushDeltaResult>> {
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
@@ -1945,7 +1827,6 @@ impl Timeline {
let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
drop(guard);
if layer_generation == tline.generation {
stats.discard_delta_layer();
// TODO: depending on whether we design this compaction process to run along with
// other compactions, there could be layer map modifications after we drop the
// layer guard, and in case it creates a duplicated layer key, we will still error
@@ -1972,16 +1853,9 @@ impl Timeline {
for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
stats.produce_delta_layer(delta_layer_writer.size());
if dry_run {
return Ok(None);
}
let (desc, path) = delta_layer_writer
.finish(delta_key.key_range.end, ctx)
let delta_layer = delta_layer_writer
.finish(delta_key.key_range.end, tline, ctx)
.await?;
let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
}
@@ -2073,13 +1947,6 @@ impl Timeline {
let mut current_delta_split_point = 0;
let mut delta_layers = Vec::new();
while let Some((key, lsn, val)) = merge_iter.next().await? {
if cancel.is_cancelled() {
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
}
match val {
Value::Image(_) => stat.visit_image_key(&val),
Value::WalRecord(_) => stat.visit_wal_key(&val),
}
if last_key.is_none() || last_key.as_ref() == Some(&key) {
if last_key.is_none() {
last_key = Some(key);
@@ -2087,7 +1954,6 @@ impl Timeline {
accumulated_values.push((key, lsn, val));
} else {
let last_key = last_key.as_mut().unwrap();
stat.on_unique_key_visited();
let retention = self
.generate_key_retention(
*last_key,
@@ -2104,7 +1970,6 @@ impl Timeline {
*last_key,
&mut delta_values,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
@@ -2117,8 +1982,6 @@ impl Timeline {
self,
lowest_retain_lsn,
ctx,
&mut stat,
dry_run,
false,
)
.await?,
@@ -2131,7 +1994,6 @@ impl Timeline {
let last_key = last_key.expect("no keys produced during compaction");
// TODO: move this part to the loop body
stat.on_unique_key_visited();
let retention = self
.generate_key_retention(
last_key,
@@ -2148,7 +2010,6 @@ impl Timeline {
last_key,
&mut delta_values,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
@@ -2161,8 +2022,6 @@ impl Timeline {
self,
lowest_retain_lsn,
ctx,
&mut stat,
dry_run,
true,
)
.await?,
@@ -2170,28 +2029,12 @@ impl Timeline {
assert!(delta_values.is_empty(), "unprocessed keys");
let image_layer = if discard_image_layer {
stat.discard_image_layer();
None
} else if let Some(writer) = image_layer_writer {
stat.produce_image_layer(writer.size());
if !dry_run {
Some(writer.finish(self, ctx).await?)
} else {
None
}
Some(writer.finish(self, ctx).await?)
} else {
None
};
info!(
"gc-compaction statistics: {}",
serde_json::to_string(&stat)?
);
if dry_run {
return Ok(());
}
info!(
"produced {} delta layers and {} image layers",
delta_layers.len(),
@@ -2215,19 +2058,14 @@ impl Timeline {
let mut layer_selection = layer_selection;
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
compact_to.extend(image_layer);
// Step 3: Place back to the layer map.
{
let mut guard = self.layers.write().await;
guard
.open_mut()?
.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
};
self.remote_client
.schedule_compaction_update(&layer_selection, &compact_to)?;
drop(gc_lock);
Ok(())
}
}
@@ -2301,7 +2139,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
self.flush_updates().await?;
let guard = self.timeline.layers.read().await;
let layer_map = guard.layer_map()?;
let layer_map = guard.layer_map();
let result = layer_map
.iter_historic_layers()
@@ -2424,9 +2262,9 @@ impl CompactionJobExecutor for TimelineAdaptor {
))
});
let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?;
let new_delta_layer =
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
let new_delta_layer = writer
.finish(prev.unwrap().0.next(), &self.timeline, ctx)
.await?;
self.new_deltas.push(new_delta_layer);
Ok(())

View File

@@ -63,19 +63,10 @@ pub(super) async fn delete_local_timeline_directory(
tenant_shard_id: TenantShardId,
timeline: &Timeline,
) -> anyhow::Result<()> {
// Always ensure the lock order is compaction -> gc.
let compaction_lock = timeline.compaction_lock.lock();
let compaction_lock = crate::timed(
compaction_lock,
"acquires compaction lock",
std::time::Duration::from_secs(5),
)
.await;
let gc_lock = timeline.gc_lock.lock();
let gc_lock = crate::timed(
gc_lock,
"acquires gc lock",
let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
let guards = crate::timed(
guards,
"acquire gc and compaction locks",
std::time::Duration::from_secs(5),
)
.await;
@@ -116,8 +107,7 @@ pub(super) async fn delete_local_timeline_directory(
.context("fsync_pre_mark_remove")?;
info!("finished deleting layer files, releasing locks");
drop(gc_lock);
drop(compaction_lock);
drop(guards);
fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
@@ -230,8 +220,6 @@ impl DeleteTimelineFlow {
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Hard).await;
tenant.gc_block.before_delete(&timeline);
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at"

View File

@@ -1,4 +1,4 @@
use std::{collections::HashSet, sync::Arc};
use std::sync::Arc;
use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
use crate::{
@@ -74,11 +74,6 @@ impl From<crate::tenant::upload_queue::NotInitialized> for Error {
Error::ShuttingDown
}
}
impl From<super::layer_manager::Shutdown> for Error {
fn from(_: super::layer_manager::Shutdown) -> Self {
Error::ShuttingDown
}
}
impl From<FlushLayerError> for Error {
fn from(value: FlushLayerError) -> Self {
@@ -146,9 +141,50 @@ pub(super) async fn prepare(
}
}
let reparented_timelines = reparented_direct_children(detached, tenant)?;
// detached has previously been detached; let's inspect each of the current timelines and
// report back the timelines which have been reparented by our detach
let mut all_direct_children = tenant
.timelines
.lock()
.unwrap()
.values()
.filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
.map(|tl| (tl.ancestor_lsn, tl.clone()))
.collect::<Vec<_>>();
let mut any_shutdown = false;
all_direct_children.retain(
|(_, tl)| match tl.remote_client.initialized_upload_queue() {
Ok(accessor) => accessor
.latest_uploaded_index_part()
.lineage
.is_reparented(),
Err(_shutdownalike) => {
// not 100% a shutdown, but let's bail early not to give inconsistent results in
// sharded environment.
any_shutdown = true;
true
}
},
);
if any_shutdown {
// it could be one or many being deleted; have client retry
return Err(Error::ShuttingDown);
}
let mut reparented = all_direct_children;
// why this instead of hashset? there is a reason, but I've forgotten it many times.
//
// maybe if this was a hashset we would not be able to distinguish some race condition.
reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
return Ok(Progress::Done(AncestorDetached {
reparented_timelines,
reparented_timelines: reparented
.into_iter()
.map(|(_, tl)| tl.timeline_id)
.collect(),
}));
};
@@ -241,7 +277,7 @@ pub(super) async fn prepare(
// between retries, these can change if compaction or gc ran in between. this will mean
// we have to redo work.
partition_work(ancestor_lsn, &layers)?
partition_work(ancestor_lsn, &layers)
};
// TODO: layers are already sorted by something: use that to determine how much of remote
@@ -345,67 +381,16 @@ pub(super) async fn prepare(
Ok(Progress::Prepared(guard, prepared))
}
fn reparented_direct_children(
detached: &Arc<Timeline>,
tenant: &Tenant,
) -> Result<HashSet<TimelineId>, Error> {
let mut all_direct_children = tenant
.timelines
.lock()
.unwrap()
.values()
.filter_map(|tl| {
let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached));
if is_direct_child {
Some(tl.clone())
} else {
if let Some(timeline) = tl.ancestor_timeline.as_ref() {
assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live");
}
None
}
})
// Collect to avoid lock taking order problem with Tenant::timelines and
// Timeline::remote_client
.collect::<Vec<_>>();
let mut any_shutdown = false;
all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() {
Ok(accessor) => accessor
.latest_uploaded_index_part()
.lineage
.is_reparented(),
Err(_shutdownalike) => {
// not 100% a shutdown, but let's bail early not to give inconsistent results in
// sharded environment.
any_shutdown = true;
true
}
});
if any_shutdown {
// it could be one or many being deleted; have client retry
return Err(Error::ShuttingDown);
}
Ok(all_direct_children
.into_iter()
.map(|tl| tl.timeline_id)
.collect())
}
fn partition_work(
ancestor_lsn: Lsn,
source: &LayerManager,
) -> Result<(usize, Vec<Layer>, Vec<Layer>), Error> {
source_layermap: &LayerManager,
) -> (usize, Vec<Layer>, Vec<Layer>) {
let mut straddling_branchpoint = vec![];
let mut rest_of_historic = vec![];
let mut later_by_lsn = 0;
for desc in source.layer_map()?.iter_historic_layers() {
for desc in source_layermap.layer_map().iter_historic_layers() {
// off by one chances here:
// - start is inclusive
// - end is exclusive
@@ -424,10 +409,10 @@ fn partition_work(
&mut rest_of_historic
};
target.push(source.get_from_desc(&desc));
target.push(source_layermap.get_from_desc(&desc));
}
Ok((later_by_lsn, straddling_branchpoint, rest_of_historic))
(later_by_lsn, straddling_branchpoint, rest_of_historic)
}
async fn upload_rewritten_layer(
@@ -503,12 +488,10 @@ async fn copy_lsn_prefix(
// reuse the key instead of adding more holes between layers by using the real
// highest key in the layer.
let reused_highest_key = layer.layer_desc().key_range.end;
let (desc, path) = writer
.finish(reused_highest_key, ctx)
let copied = writer
.finish(reused_highest_key, target_timeline, ctx)
.await
.map_err(CopyDeltaPrefix)?;
let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
.map_err(CopyDeltaPrefix)?;
tracing::debug!(%layer, %copied, "new layer produced");
@@ -554,12 +537,11 @@ pub(super) async fn complete(
tenant: &Tenant,
prepared: PreparedTimelineDetach,
_ctx: &RequestContext,
) -> Result<HashSet<TimelineId>, anyhow::Error> {
) -> Result<Vec<TimelineId>, anyhow::Error> {
let PreparedTimelineDetach { layers } = prepared;
let ancestor = detached
.ancestor_timeline
.as_ref()
.get_ancestor_timeline()
.expect("must still have a ancestor");
let ancestor_lsn = detached.get_ancestor_lsn();
@@ -599,7 +581,7 @@ pub(super) async fn complete(
}
let tl_ancestor = tl.ancestor_timeline.as_ref()?;
let is_same = Arc::ptr_eq(ancestor, tl_ancestor);
let is_same = Arc::ptr_eq(&ancestor, tl_ancestor);
let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
let is_deleting = tl
@@ -640,18 +622,13 @@ pub(super) async fn complete(
});
let reparenting_candidates = tasks.len();
let mut reparented = HashSet::with_capacity(tasks.len());
let mut reparented = Vec::with_capacity(tasks.len());
while let Some(res) = tasks.join_next().await {
match res {
Ok(Some(timeline)) => {
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
assert!(
reparented.insert(timeline.timeline_id),
"duplicate reparenting? timeline_id={}",
timeline.timeline_id
);
reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
}
Ok(None) => {
// lets just ignore this for now. one or all reparented timelines could had
@@ -673,5 +650,12 @@ pub(super) async fn complete(
tracing::info!("failed to reparent some candidates");
}
reparented.sort_unstable();
let reparented = reparented
.into_iter()
.map(|(_, timeline_id)| timeline_id)
.collect();
Ok(reparented)
}

View File

@@ -213,45 +213,51 @@ impl Timeline {
let mut js = tokio::task::JoinSet::new();
{
let guard = self.layers.read().await;
let layers = guard.layer_map();
for layer in layers.iter_historic_layers() {
let layer = guard.get_from_desc(&layer);
guard
.likely_resident_layers()
.filter(|layer| {
let last_activity_ts = layer.latest_activity();
// guard against eviction while we inspect it; it might be that eviction_task and
// disk_usage_eviction_task both select the same layers to be evicted, and
// seemingly free up double the space. both succeeding is of no consequence.
let no_activity_for = match now.duration_since(last_activity_ts) {
Ok(d) => d,
Err(_e) => {
// We reach here if `now` < `last_activity_ts`, which can legitimately
// happen if there is an access between us getting `now`, and us getting
// the access stats from the layer.
//
// The other reason why it can happen is system clock skew because
// SystemTime::now() is not monotonic, so, even if there is no access
// to the layer after we get `now` at the beginning of this function,
// it could be that `now` < `last_activity_ts`.
//
// To distinguish the cases, we would need to record `Instant`s in the
// access stats (i.e., monotonic timestamps), but then, the timestamps
// values in the access stats would need to be `Instant`'s, and hence
// they would be meaningless outside of the pageserver process.
// At the time of writing, the trade-off is that access stats are more
// valuable than detecting clock skew.
return false;
}
};
if !layer.is_likely_resident() {
continue;
}
no_activity_for > p.threshold
})
.cloned()
.for_each(|layer| {
let last_activity_ts = layer.access_stats().latest_activity();
let no_activity_for = match now.duration_since(last_activity_ts) {
Ok(d) => d,
Err(_e) => {
// We reach here if `now` < `last_activity_ts`, which can legitimately
// happen if there is an access between us getting `now`, and us getting
// the access stats from the layer.
//
// The other reason why it can happen is system clock skew because
// SystemTime::now() is not monotonic, so, even if there is no access
// to the layer after we get `now` at the beginning of this function,
// it could be that `now` < `last_activity_ts`.
//
// To distinguish the cases, we would need to record `Instant`s in the
// access stats (i.e., monotonic timestamps), but then, the timestamps
// values in the access stats would need to be `Instant`'s, and hence
// they would be meaningless outside of the pageserver process.
// At the time of writing, the trade-off is that access stats are more
// valuable than detecting clock skew.
continue;
}
};
if no_activity_for > p.threshold {
js.spawn(async move {
layer
.evict_and_wait(std::time::Duration::from_secs(5))
.await
});
stats.candidates += 1;
});
}
}
};
let join_all = async move {
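The `Err(_e)` branch above exists because `SystemTime` is not monotonic; a minimal, self-contained sketch of that behavior (the `idle_for` helper is hypothetical, not part of this patch):

use std::time::{Duration, SystemTime};

// How long a layer has been idle, or None when `now` appears to be earlier
// than the last recorded access (clock skew, or an access racing with `now`).
fn idle_for(now: SystemTime, last_activity_ts: SystemTime) -> Option<Duration> {
    now.duration_since(last_activity_ts).ok()
}

fn main() {
    let now = SystemTime::now();
    let later = now + Duration::from_secs(1);
    // "now" earlier than the last activity: treat as recently active, skip eviction.
    assert!(idle_for(now, later).is_none());
    assert_eq!(idle_for(later, now), Some(Duration::from_secs(1)));
}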

View File

@@ -1,4 +1,4 @@
use anyhow::{bail, ensure, Context};
use anyhow::{bail, ensure, Context, Result};
use itertools::Itertools;
use pageserver_api::shard::TenantShardId;
use std::{collections::HashMap, sync::Arc};
@@ -24,142 +24,39 @@ use crate::{
use super::TimelineWriterState;
/// Provides semantic APIs to manipulate the layer map.
pub(crate) enum LayerManager {
/// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate
/// the layers.
Open(OpenLayerManager),
/// Shutdown layer manager where there are no more in-memory layers and persistent layers are
/// read-only.
Closed {
layers: HashMap<PersistentLayerKey, Layer>,
},
}
impl Default for LayerManager {
fn default() -> Self {
LayerManager::Open(OpenLayerManager::default())
}
#[derive(Default)]
pub(crate) struct LayerManager {
layer_map: LayerMap,
layer_fmgr: LayerFileManager<Layer>,
}
impl LayerManager {
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.layers()
.get(key)
.with_context(|| format!("get layer from key: {key}"))
.expect("not found")
.clone()
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
self.layer_fmgr.get_from_desc(desc)
}
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
self.get_from_key(&desc.key())
pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
self.layer_fmgr.get_from_key(desc)
}
/// Get an immutable reference to the layer map.
///
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
/// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> {
use LayerManager::*;
match self {
Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map),
Closed { .. } => Err(Shutdown),
}
pub(crate) fn layer_map(&self) -> &LayerMap {
&self.layer_map
}
pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> {
use LayerManager::*;
match self {
Open(open) => Ok(open),
Closed { .. } => Err(Shutdown),
}
}
/// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in
/// order to allow shutdown to complete.
///
/// If there was a want to flush in-memory layers, it must have happened earlier.
pub(crate) fn shutdown(&mut self, writer_state: &mut Option<TimelineWriterState>) {
use LayerManager::*;
match self {
Open(OpenLayerManager {
layer_map,
layer_fmgr: LayerFileManager(hashmap),
}) => {
let open = layer_map.open_layer.take();
let frozen = layer_map.frozen_layers.len();
let taken_writer_state = writer_state.take();
tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers");
let layers = std::mem::take(hashmap);
*self = Closed { layers };
assert_eq!(open.is_some(), taken_writer_state.is_some());
}
Closed { .. } => {
tracing::debug!("ignoring multiple shutdowns on layer manager")
}
}
}
/// Sum up the historic layer sizes
pub(crate) fn layer_size_sum(&self) -> u64 {
self.layers()
.values()
.map(|l| l.layer_desc().file_size)
.sum()
}
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = &'_ Layer> + '_ {
self.layers().values().filter(|l| l.is_likely_resident())
}
pub(crate) fn contains(&self, layer: &Layer) -> bool {
self.contains_key(&layer.layer_desc().key())
}
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.layers().contains_key(key)
}
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
self.layers().keys().cloned().collect_vec()
}
fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
use LayerManager::*;
match self {
Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0,
Closed { layers } => layers,
}
}
}
#[derive(Default)]
pub(crate) struct OpenLayerManager {
layer_map: LayerMap,
layer_fmgr: LayerFileManager<Layer>,
}
impl std::fmt::Debug for OpenLayerManager {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("OpenLayerManager")
.field("layer_count", &self.layer_fmgr.0.len())
.finish()
}
}
#[derive(Debug, thiserror::Error)]
#[error("layer manager has been shutdown")]
pub(crate) struct Shutdown;
impl OpenLayerManager {
/// Called from `load_layer_map`. Initialize the layer manager with:
/// 1. all on-disk layers
/// 2. next open layer (with disk disk_consistent_lsn LSN)
pub(crate) fn initialize_local_layers(&mut self, layers: Vec<Layer>, next_open_layer_at: Lsn) {
pub(crate) fn initialize_local_layers(
&mut self,
on_disk_layers: Vec<Layer>,
next_open_layer_at: Lsn,
) {
let mut updates = self.layer_map.batch_update();
for layer in layers {
for layer in on_disk_layers {
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
}
updates.flush();
@@ -171,19 +68,26 @@ impl OpenLayerManager {
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
}
/// Open a new writable layer to append data if there is no open layer, otherwise return the
/// current open layer, called within `get_layer_for_write`.
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
/// called within `get_layer_for_write`.
pub(crate) async fn get_layer_for_write(
&mut self,
lsn: Lsn,
last_record_lsn: Lsn,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
) -> anyhow::Result<Arc<InMemoryLayer>> {
) -> Result<Arc<InMemoryLayer>> {
ensure!(lsn.is_aligned());
ensure!(
lsn > last_record_lsn,
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
lsn,
last_record_lsn,
);
// Do we have a layer open for writing already?
let layer = if let Some(open_layer) = &self.layer_map.open_layer {
if open_layer.get_lsn_range().start > lsn {
@@ -209,15 +113,8 @@ impl OpenLayerManager {
lsn
);
let new_layer = InMemoryLayer::create(
conf,
timeline_id,
tenant_shard_id,
start_lsn,
gate_guard,
ctx,
)
.await?;
let new_layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
let layer = Arc::new(new_layer);
self.layer_map.open_layer = Some(layer.clone());
@@ -271,7 +168,7 @@ impl OpenLayerManager {
froze
}
/// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`].
/// Add image layers to the layer map, called from `create_image_layers`.
pub(crate) fn track_new_image_layers(
&mut self,
image_layers: &[ResidentLayer],
@@ -344,7 +241,7 @@ impl OpenLayerManager {
self.finish_compact_l0(compact_from, compact_to, metrics)
}
/// Called post-compaction when some previous generation image layers were trimmed.
/// Called when compaction is completed.
pub(crate) fn rewrite_layers(
&mut self,
rewrite_layers: &[(Layer, ResidentLayer)],
@@ -362,10 +259,13 @@ impl OpenLayerManager {
new_layer.layer_desc().lsn_range
);
// Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
// Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
// be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
// always marking rewritten layers as visible.
new_layer.as_ref().set_visibility(old_layer.visibility());
new_layer
.as_ref()
.access_stats()
.set_visibility(old_layer.access_stats().visibility());
// Safety: we may never rewrite the same file in-place. Callers are responsible
// for ensuring that they only rewrite layers after something changes the path,
@@ -433,6 +333,31 @@ impl OpenLayerManager {
mapping.remove(layer);
layer.delete_on_drop();
}
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
// for small layer maps, most layers are likely resident, but for larger maps more layers are likely
// to have been evicted, assuming a large layer count correlates with a longer lifespan.
self.layer_map().iter_historic_layers().filter_map(|desc| {
self.layer_fmgr
.0
.get(&desc.key())
.filter(|l| l.is_likely_resident())
.cloned()
})
}
pub(crate) fn contains(&self, layer: &Layer) -> bool {
self.layer_fmgr.contains(layer)
}
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.layer_fmgr.contains_key(key)
}
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
self.layer_fmgr.0.keys().cloned().collect_vec()
}
}
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
@@ -444,6 +369,24 @@ impl<T> Default for LayerFileManager<T> {
}
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
fn get_from_key(&self, key: &PersistentLayerKey) -> T {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.0
.get(key)
.with_context(|| format!("get layer from key: {}", key))
.expect("not found")
.clone()
}
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
self.get_from_key(&desc.key())
}
fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.0.contains_key(key)
}
pub(crate) fn insert(&mut self, layer: T) {
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
if present.is_some() && cfg!(debug_assertions) {
@@ -451,6 +394,10 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
}
}
pub(crate) fn contains(&self, layer: &T) -> bool {
self.0.contains_key(&layer.layer_desc().key())
}
pub(crate) fn remove(&mut self, layer: &T) {
let present = self.0.remove(&layer.layer_desc().key());
if present.is_none() && cfg!(debug_assertions) {

View File

@@ -122,10 +122,6 @@ impl CurrentLogicalSize {
Self::Exact(_) => Accuracy::Exact,
}
}
pub(crate) fn is_exact(&self) -> bool {
matches!(self, Self::Exact(_))
}
}
impl LogicalSize {

View File

@@ -30,12 +30,10 @@ use tokio::time::Instant;
pub use pageserver_api::models::virtual_file as api;
pub(crate) mod io_engine;
pub use io_engine::feature_test as io_engine_feature_test;
pub use io_engine::io_engine_for_bench;
pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
mod metadata;
mod open_options;
use self::owned_buffers_io::write::OwnedAsyncWriter;
pub(crate) use api::DirectIoMode;
pub(crate) use io_engine::IoEngineKind;
pub(crate) use metadata::Metadata;
pub(crate) use open_options::*;

View File

@@ -328,29 +328,3 @@ pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
.join()
.unwrap()
}
/// For use in benchmark binaries only.
///
/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also
/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste
/// developer time trying to figure out why it's slow.
///
/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic.
pub fn io_engine_for_bench() -> IoEngineKind {
#[cfg(not(target_os = "linux"))]
{
panic!("This benchmark does I/O and can only give a representative result on Linux");
}
#[cfg(target_os = "linux")]
{
match feature_test().unwrap() {
FeatureTestResult::PlatformPreferred(engine) => engine,
FeatureTestResult::Worse {
engine: _engine,
remark,
} => {
panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}");
}
}
}
}

View File

@@ -92,6 +92,7 @@ impl WalIngest {
decoded: &mut DecodedWALRecord,
ctx: &RequestContext,
) -> anyhow::Result<bool> {
eprintln!("ingest_record @ {lsn}");
WAL_INGEST.records_received.inc();
let pg_version = modification.tline.pg_version;
let prev_len = modification.len();

poetry.lock generated
View File

@@ -1514,20 +1514,6 @@ files = [
[package.dependencies]
six = "*"
[[package]]
name = "kafka-python"
version = "2.0.2"
description = "Pure Python client for Apache Kafka"
optional = false
python-versions = "*"
files = [
{file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
{file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
]
[package.extras]
crc32c = ["crc32c"]
[[package]]
name = "lazy-object-proxy"
version = "1.10.0"
@@ -3371,4 +3357,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72"
content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"

View File

@@ -92,7 +92,6 @@ tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
try-lock.workspace = true
typed-json.workspace = true
url.workspace = true
urlencoding.workspace = true

View File

@@ -218,7 +218,7 @@ impl RateBucketInfo {
impl AuthenticationConfig {
pub fn check_rate_limit(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
config: &AuthenticationConfig,
secret: AuthSecret,
endpoint: &EndpointId,
@@ -243,7 +243,7 @@ impl AuthenticationConfig {
let limit_not_exceeded = self.rate_limiter.check(
(
endpoint_int,
MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet),
MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
),
password_weight,
);
@@ -274,7 +274,7 @@ impl AuthenticationConfig {
///
/// All authentication flows will emit an AuthenticationOk message if successful.
async fn auth_quirks(
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
api: &impl console::Api,
user_info: ComputeUserInfoMaybeEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -303,8 +303,8 @@ async fn auth_quirks(
let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
// check allowed list
if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
}
if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
@@ -356,7 +356,7 @@ async fn auth_quirks(
}
async fn authenticate_with_secret(
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
secret: AuthSecret,
info: ComputeUserInfo,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -421,7 +421,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub async fn authenticate(
self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
@@ -467,7 +467,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
impl BackendType<'_, ComputeUserInfo, &()> {
pub async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
use BackendType::*;
match self {
@@ -478,7 +478,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
pub async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
use BackendType::*;
match self {
@@ -492,7 +492,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
use BackendType::*;
@@ -514,7 +514,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
use BackendType::*;
@@ -571,7 +571,7 @@ mod tests {
impl console::Api for Auth {
async fn get_role_secret(
&self,
_ctx: &RequestMonitoring,
_ctx: &mut RequestMonitoring,
_user_info: &super::ComputeUserInfo,
) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
@@ -579,7 +579,7 @@ mod tests {
async fn get_allowed_ips_and_secret(
&self,
_ctx: &RequestMonitoring,
_ctx: &mut RequestMonitoring,
_user_info: &super::ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
{
@@ -591,7 +591,7 @@ mod tests {
async fn wake_compute(
&self,
_ctx: &RequestMonitoring,
_ctx: &mut RequestMonitoring,
_user_info: &super::ComputeUserInfo,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
unimplemented!()
@@ -665,7 +665,7 @@ mod tests {
let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server));
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -723,7 +723,7 @@ mod tests {
));
let _creds = auth_quirks(
&ctx,
&mut ctx,
&api,
user_info,
&mut stream,
@@ -742,7 +742,7 @@ mod tests {
let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server));
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -775,7 +775,7 @@ mod tests {
));
let _creds = auth_quirks(
&ctx,
&mut ctx,
&api,
user_info,
&mut stream,
@@ -794,7 +794,7 @@ mod tests {
let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server));
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -828,7 +828,7 @@ mod tests {
));
let creds = auth_quirks(
&ctx,
&mut ctx,
&api,
user_info,
&mut stream,

View File

@@ -12,7 +12,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
pub(super) async fn authenticate(
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
creds: ComputeUserInfo,
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
config: &'static AuthenticationConfig,
@@ -27,7 +27,7 @@ pub(super) async fn authenticate(
}
AuthSecret::Scram(secret) => {
info!("auth endpoint chooses SCRAM");
let scram = auth::Scram(&secret, ctx);
let scram = auth::Scram(&secret, &mut *ctx);
let auth_outcome = tokio::time::timeout(
config.scram_protocol_timeout,

View File

@@ -18,7 +18,7 @@ use tracing::{info, warn};
/// These properties are beneficial for serverless JS workers, so we
/// use this mechanism for websocket connections.
pub async fn authenticate_cleartext(
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
info: ComputeUserInfo,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
secret: AuthSecret,
@@ -28,7 +28,7 @@ pub async fn authenticate_cleartext(
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
// pause the timer while we communicate with the client
let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let ep = EndpointIdInt::from(&info.endpoint);
@@ -60,7 +60,7 @@ pub async fn authenticate_cleartext(
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
/// and passwords are not yet validated (we don't know how to validate them!)
pub async fn password_hack_no_authentication(
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
info: ComputeUserInfoNoEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
) -> auth::Result<ComputeCredentials> {
@@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication(
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
// pause the timer while we communicate with the client
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)

View File

@@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String {
}
pub(super) async fn authenticate(
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {

View File

@@ -84,7 +84,7 @@ pub fn endpoint_sni(
impl ComputeUserInfoMaybeEndpoint {
pub fn parse(
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
params: &StartupMessageParams,
sni: Option<&str>,
common_names: Option<&HashSet<String>>,
@@ -249,8 +249,8 @@ mod tests {
fn parse_bare_minimum() -> anyhow::Result<()> {
// According to postgresql, only `user` should be required.
let options = StartupMessageParams::new([("user", "john_doe")]);
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id, None);
@@ -264,8 +264,8 @@ mod tests {
("database", "world"), // should be ignored
("foo", "bar"), // should be ignored
]);
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id, None);
@@ -279,9 +279,9 @@ mod tests {
let sni = Some("foo.localhost");
let common_names = Some(["localhost".into()].into());
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
assert_eq!(user_info.options.get_cache_key("foo"), "foo");
@@ -296,8 +296,8 @@ mod tests {
("options", "-ckey=1 project=bar -c geqo=off"),
]);
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
@@ -311,8 +311,8 @@ mod tests {
("options", "-ckey=1 endpoint=bar -c geqo=off"),
]);
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
@@ -329,8 +329,8 @@ mod tests {
),
]);
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert!(user_info.endpoint_id.is_none());
@@ -344,8 +344,8 @@ mod tests {
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
]);
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert!(user_info.endpoint_id.is_none());
@@ -359,9 +359,9 @@ mod tests {
let sni = Some("baz.localhost");
let common_names = Some(["localhost".into()].into());
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));
@@ -374,16 +374,16 @@ mod tests {
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.a.com");
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.b.com");
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
Ok(())
@@ -397,9 +397,10 @@ mod tests {
let sni = Some("second.localhost");
let common_names = Some(["localhost".into()].into());
let ctx = RequestMonitoring::test();
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
.expect_err("should fail");
let mut ctx = RequestMonitoring::test();
let err =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
.expect_err("should fail");
match err {
InconsistentProjectNames { domain, option } => {
assert_eq!(option, "first");
@@ -416,9 +417,10 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["example.com".into()].into());
let ctx = RequestMonitoring::test();
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
.expect_err("should fail");
let mut ctx = RequestMonitoring::test();
let err =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
.expect_err("should fail");
match err {
UnknownCommonName { cn } => {
assert_eq!(cn, "localhost");
@@ -436,9 +438,9 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["localhost".into()].into());
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
assert_eq!(
user_info.options.get_cache_key("project"),

View File

@@ -27,7 +27,7 @@ pub trait AuthMethod {
pub struct Begin;
/// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring);
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring);
impl AuthMethod for Scram<'_> {
#[inline(always)]
@@ -155,7 +155,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
let Scram(secret, ctx) = self.state;
// pause the timer while we communicate with the client
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
// Initial client message contains the chosen auth method's name.
let msg = self.stream.read_password_message().await?;
@@ -168,8 +168,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
}
match sasl.method {
SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256),
SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256),
SCRAM_SHA_256_PLUS => {
ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus)
}
_ => {}
}
info!("client chooses {}", sasl.method);

View File

@@ -205,7 +205,7 @@ async fn task_main(
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
raw_stream: S,
tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint,
@@ -256,13 +256,13 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
async fn handle_client(
ctx: RequestMonitoring,
mut ctx: RequestMonitoring,
dest_suffix: Arc<String>,
tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint,
stream: impl AsyncRead + AsyncWrite + Unpin,
) -> anyhow::Result<()> {
let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?;
let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?;
// Cut off first part of the SNI domain
// We receive required destination details in the format of

View File

@@ -5,7 +5,6 @@ use aws_config::meta::region::RegionProviderChain;
use aws_config::profile::ProfileFileCredentialsProvider;
use aws_config::provider_config::ProviderConfig;
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
use aws_config::Region;
use futures::future::Either;
use proxy::auth;
use proxy::auth::backend::AuthRateLimiter;
@@ -291,10 +290,9 @@ async fn main() -> anyhow::Result<()> {
let config = build_config(&args)?;
info!("Authentication backend: {}", config.auth_backend);
info!("Using region: {}", args.aws_region);
info!("Using region: {}", config.aws_region);
let region_provider =
RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone()));
let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
let provider_conf =
ProviderConfig::without_region().with_region(region_provider.region().await);
let aws_credentials_provider = {
@@ -320,7 +318,7 @@ async fn main() -> anyhow::Result<()> {
};
let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
elasticache::AWSIRSAConfig::new(
args.aws_region.clone(),
config.aws_region.clone(),
args.redis_cluster_name,
args.redis_user_id,
),
@@ -378,14 +376,11 @@ async fn main() -> anyhow::Result<()> {
let cancel_map = CancelMap::default();
let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone());
RateBucketInfo::validate(redis_rps_limit)?;
let redis_publisher = match &regional_redis_client {
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
redis_publisher.clone(),
args.region.clone(),
redis_rps_limit,
&config.redis_rps_limit,
)?))),
None => None,
};
@@ -661,6 +656,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
)?;
let http_config = HttpConfig {
request_timeout: args.sql_over_http.sql_over_http_timeout,
pool_options: GlobalConnPoolOptions {
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
@@ -680,6 +676,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
};
let mut redis_rps_limit = args.redis_rps_limit.clone();
RateBucketInfo::validate(&mut redis_rps_limit)?;
let config = Box::leak(Box::new(ProxyConfig {
tls_config,
auth_backend,
@@ -688,8 +687,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
http_config,
authentication_config,
require_client_ip: args.require_client_ip,
disable_ip_check_for_http: args.disable_ip_check_for_http,
redis_rps_limit,
handshake_timeout: args.handshake_timeout,
region: args.region.clone(),
aws_region: args.aws_region.clone(),
wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
connect_compute_locks,
connect_to_compute_retry_config: config::RetryConfig::parse(

View File

@@ -68,7 +68,7 @@ impl EndpointsCache {
ready: AtomicBool::new(false),
}
}
pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool {
if !self.ready.load(Ordering::Acquire) {
return true;
}

View File

@@ -288,12 +288,12 @@ impl ConnCfg {
/// Connect to a corresponding compute node.
pub async fn connect(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
allow_self_signed_compute: bool,
aux: MetricsAuxInfo,
timeout: Duration,
) -> Result<PostgresConnection, ConnectionError> {
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
drop(pause);
@@ -316,14 +316,14 @@ impl ConnCfg {
)?;
// connect_raw() will not use TLS if sslmode is "disable"
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (client, connection) = self.0.connect_raw(stream, tls).await?;
drop(pause);
tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
let stream = connection.stream.into_inner();
info!(
cold_start_info = ctx.cold_start_info().as_str(),
cold_start_info = ctx.cold_start_info.as_str(),
"connected to compute node at {host} ({socket_addr}) sslmode={:?}",
self.0.get_ssl_mode()
);
@@ -342,7 +342,7 @@ impl ConnCfg {
params,
cancel_closure,
aux,
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol),
};
Ok(connection)

View File

@@ -31,8 +31,11 @@ pub struct ProxyConfig {
pub http_config: HttpConfig,
pub authentication_config: AuthenticationConfig,
pub require_client_ip: bool,
pub disable_ip_check_for_http: bool,
pub redis_rps_limit: Vec<RateBucketInfo>,
pub region: String,
pub handshake_timeout: Duration,
pub aws_region: String,
pub wake_compute_retry_config: RetryConfig,
pub connect_compute_locks: ApiLocks<Host>,
pub connect_to_compute_retry_config: RetryConfig,
@@ -52,6 +55,7 @@ pub struct TlsConfig {
}
pub struct HttpConfig {
pub request_timeout: tokio::time::Duration,
pub pool_options: GlobalConnPoolOptions,
pub cancel_set: CancelSet,
pub client_conn_threshold: u64,

View File

@@ -292,7 +292,7 @@ pub struct NodeInfo {
impl NodeInfo {
pub async fn connect(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
timeout: Duration,
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
self.config
@@ -330,20 +330,20 @@ pub(crate) trait Api {
/// We still have to mock the scram to avoid leaking information that user doesn't exist.
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}
@@ -363,7 +363,7 @@ pub enum ConsoleBackend {
impl Api for ConsoleBackend {
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
use ConsoleBackend::*;
@@ -378,7 +378,7 @@ impl Api for ConsoleBackend {
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
use ConsoleBackend::*;
@@ -393,7 +393,7 @@ impl Api for ConsoleBackend {
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
use ConsoleBackend::*;

View File

@@ -158,7 +158,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,
_ctx: &RequestMonitoring,
_ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
Ok(CachedRoleSecret::new_uncached(
@@ -168,7 +168,7 @@ impl super::Api for Api {
async fn get_allowed_ips_and_secret(
&self,
_ctx: &RequestMonitoring,
_ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
Ok((
@@ -182,7 +182,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
_ctx: &RequestMonitoring,
_ctx: &mut RequestMonitoring,
_user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {
self.do_wake_compute().map_ok(Cached::new_uncached).await

View File

@@ -57,7 +57,7 @@ impl Api {
async fn do_get_auth_info(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> {
if !self
@@ -69,7 +69,7 @@ impl Api {
info!("endpoint is not valid, skipping the request");
return Ok(AuthInfo::default());
}
let request_id = ctx.session_id().to_string();
let request_id = ctx.session_id.to_string();
let application_name = ctx.console_application_name();
async {
let request = self
@@ -77,7 +77,7 @@ impl Api {
.get("proxy_get_role_secret")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())])
.query(&[("session_id", ctx.session_id)])
.query(&[
("application_name", application_name.as_str()),
("project", user_info.endpoint.as_str()),
@@ -87,7 +87,7 @@ impl Api {
info!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?;
drop(pause);
info!(duration = ?start.elapsed(), "received http response");
@@ -130,10 +130,10 @@ impl Api {
async fn do_wake_compute(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<NodeInfo, WakeComputeError> {
let request_id = ctx.session_id().to_string();
let request_id = ctx.session_id.to_string();
let application_name = ctx.console_application_name();
async {
let mut request_builder = self
@@ -141,7 +141,7 @@ impl Api {
.get("proxy_wake_compute")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())])
.query(&[("session_id", ctx.session_id)])
.query(&[
("application_name", application_name.as_str()),
("project", user_info.endpoint.as_str()),
@@ -156,7 +156,7 @@ impl Api {
info!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?;
drop(pause);
info!(duration = ?start.elapsed(), "received http response");
@@ -192,7 +192,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize();
@@ -226,7 +226,7 @@ impl super::Api for Api {
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize();
@@ -268,7 +268,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {
let key = user_info.endpoint_cache_key();

View File

@@ -7,14 +7,13 @@ use smol_str::SmolStr;
use std::net::IpAddr;
use tokio::sync::mpsc;
use tracing::{field::display, info, info_span, Span};
use try_lock::TryLock;
use uuid::Uuid;
use crate::{
console::messages::{ColdStartInfo, MetricsAuxInfo},
error::ErrorKind,
intern::{BranchIdInt, ProjectIdInt},
metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting},
metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol},
DbName, EndpointId, RoleName,
};
@@ -29,15 +28,7 @@ pub static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>>
///
/// This data should **not** be used for connection logic, only for observability and limiting purposes.
/// All connection logic should instead use strongly typed state machines, not a bunch of Options.
pub struct RequestMonitoring(
/// To allow easier use of the ctx object, we have interior mutability.
/// I would typically use a RefCell but that would break the `Send` requirements
/// so we need something with thread-safety. `TryLock` is a cheap alternative
/// that offers similar semantics to a `RefCell` but with synchronisation.
TryLock<RequestMonitoringInner>,
);
struct RequestMonitoringInner {
pub struct RequestMonitoring {
pub peer_addr: IpAddr,
pub session_id: Uuid,
pub protocol: Protocol,
@@ -94,7 +85,7 @@ impl RequestMonitoring {
role = tracing::field::Empty,
);
let inner = RequestMonitoringInner {
Self {
peer_addr,
session_id,
protocol,
@@ -119,9 +110,7 @@ impl RequestMonitoring {
disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol),
disconnect_timestamp: None,
};
Self(TryLock::new(inner))
}
}
#[cfg(test)]
@@ -130,177 +119,48 @@ impl RequestMonitoring {
}
pub fn console_application_name(&self) -> String {
let this = self.0.try_lock().expect("should not deadlock");
format!(
"{}/{}",
this.application.as_deref().unwrap_or_default(),
this.protocol
self.application.as_deref().unwrap_or_default(),
self.protocol
)
}
pub fn set_rejected(&self, rejected: bool) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.rejected = Some(rejected);
pub fn set_rejected(&mut self, rejected: bool) {
self.rejected = Some(rejected);
}
pub fn set_cold_start_info(&self, info: ColdStartInfo) {
self.0
.try_lock()
.expect("should not deadlock")
.set_cold_start_info(info);
}
pub fn set_db_options(&self, options: StartupMessageParams) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.set_application(options.get("application_name").map(SmolStr::from));
if let Some(user) = options.get("user") {
this.set_user(user.into());
}
if let Some(dbname) = options.get("database") {
this.set_dbname(dbname.into());
}
this.pg_options = Some(options);
}
pub fn set_project(&self, x: MetricsAuxInfo) {
let mut this = self.0.try_lock().expect("should not deadlock");
if this.endpoint_id.is_none() {
this.set_endpoint_id(x.endpoint_id.as_str().into())
}
this.branch = Some(x.branch_id);
this.project = Some(x.project_id);
this.set_cold_start_info(x.cold_start_info);
}
pub fn set_project_id(&self, project_id: ProjectIdInt) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.project = Some(project_id);
}
pub fn set_endpoint_id(&self, endpoint_id: EndpointId) {
self.0
.try_lock()
.expect("should not deadlock")
.set_endpoint_id(endpoint_id);
}
pub fn set_dbname(&self, dbname: DbName) {
self.0
.try_lock()
.expect("should not deadlock")
.set_dbname(dbname);
}
pub fn set_user(&self, user: RoleName) {
self.0
.try_lock()
.expect("should not deadlock")
.set_user(user);
}
pub fn set_auth_method(&self, auth_method: AuthMethod) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.auth_method = Some(auth_method);
}
pub fn has_private_peer_addr(&self) -> bool {
self.0
.try_lock()
.expect("should not deadlock")
.has_private_peer_addr()
}
pub fn set_error_kind(&self, kind: ErrorKind) {
let mut this = self.0.try_lock().expect("should not deadlock");
// Do not record errors from the private address to metrics.
if !this.has_private_peer_addr() {
Metrics::get().proxy.errors_total.inc(kind);
}
if let Some(ep) = &this.endpoint_id {
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
let label = metric.with_labels(kind);
metric.get_metric(label).measure(ep);
}
this.error_kind = Some(kind);
}
pub fn set_success(&self) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.success = true;
}
pub fn log_connect(&self) {
self.0
.try_lock()
.expect("should not deadlock")
.log_connect();
}
pub fn protocol(&self) -> Protocol {
self.0.try_lock().expect("should not deadlock").protocol
}
pub fn span(&self) -> Span {
self.0.try_lock().expect("should not deadlock").span.clone()
}
pub fn session_id(&self) -> Uuid {
self.0.try_lock().expect("should not deadlock").session_id
}
pub fn peer_addr(&self) -> IpAddr {
self.0.try_lock().expect("should not deadlock").peer_addr
}
pub fn cold_start_info(&self) -> ColdStartInfo {
self.0
.try_lock()
.expect("should not deadlock")
.cold_start_info
}
pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause {
LatencyTimerPause {
ctx: self,
start: tokio::time::Instant::now(),
waiting_for,
}
}
pub fn success(&self) {
self.0
.try_lock()
.expect("should not deadlock")
.latency_timer
.success()
}
}
pub struct LatencyTimerPause<'a> {
ctx: &'a RequestMonitoring,
start: tokio::time::Instant,
waiting_for: Waiting,
}
impl Drop for LatencyTimerPause<'_> {
fn drop(&mut self) {
self.ctx
.0
.try_lock()
.expect("should not deadlock")
.latency_timer
.unpause(self.start, self.waiting_for);
}
}
impl RequestMonitoringInner {
fn set_cold_start_info(&mut self, info: ColdStartInfo) {
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
self.cold_start_info = info;
self.latency_timer.cold_start_info(info);
}
fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
pub fn set_db_options(&mut self, options: StartupMessageParams) {
self.set_application(options.get("application_name").map(SmolStr::from));
if let Some(user) = options.get("user") {
self.set_user(user.into());
}
if let Some(dbname) = options.get("database") {
self.set_dbname(dbname.into());
}
self.pg_options = Some(options);
}
pub fn set_project(&mut self, x: MetricsAuxInfo) {
if self.endpoint_id.is_none() {
self.set_endpoint_id(x.endpoint_id.as_str().into())
}
self.branch = Some(x.branch_id);
self.project = Some(x.project_id);
self.set_cold_start_info(x.cold_start_info);
}
pub fn set_project_id(&mut self, project_id: ProjectIdInt) {
self.project = Some(project_id);
}
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
if self.endpoint_id.is_none() {
self.span.record("ep", display(&endpoint_id));
let metric = &Metrics::get().proxy.connecting_endpoints;
@@ -316,23 +176,44 @@ impl RequestMonitoringInner {
}
}
fn set_dbname(&mut self, dbname: DbName) {
pub fn set_dbname(&mut self, dbname: DbName) {
self.dbname = Some(dbname);
}
fn set_user(&mut self, user: RoleName) {
pub fn set_user(&mut self, user: RoleName) {
self.span.record("role", display(&user));
self.user = Some(user);
}
fn has_private_peer_addr(&self) -> bool {
pub fn set_auth_method(&mut self, auth_method: AuthMethod) {
self.auth_method = Some(auth_method);
}
pub fn has_private_peer_addr(&self) -> bool {
match self.peer_addr {
IpAddr::V4(ip) => ip.is_private(),
_ => false,
}
}
fn log_connect(&mut self) {
pub fn set_error_kind(&mut self, kind: ErrorKind) {
// Do not record errors from the private address to metrics.
if !self.has_private_peer_addr() {
Metrics::get().proxy.errors_total.inc(kind);
}
if let Some(ep) = &self.endpoint_id {
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
let label = metric.with_labels(kind);
metric.get_metric(label).measure(ep);
}
self.error_kind = Some(kind);
}
pub fn set_success(&mut self) {
self.success = true;
}
pub fn log_connect(&mut self) {
let outcome = if self.success {
ConnectOutcome::Success
} else {
@@ -375,7 +256,7 @@ impl RequestMonitoringInner {
}
}
impl Drop for RequestMonitoringInner {
impl Drop for RequestMonitoring {
fn drop(&mut self) {
if self.sender.is_some() {
self.log_connect();

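The change above replaces the shared `&RequestMonitoring` handle (interior mutability via `TryLock`) with plain `&mut RequestMonitoring` borrows. The deleted comment records the constraint the wrapper was working around: sharing the context behind a `RefCell` makes any future holding the reference non-`Send` (a `RefCell` is not `Sync`), while `try_lock::TryLock` is `Sync` and keeps such futures `Send`. A minimal stand-alone sketch of that difference, with illustrative type and function names that are not from the codebase:

// Sketch only: why &RefCell<T> cannot be shared by Send futures,
// while a TryLock<T> wrapper can. Requires the `try-lock` crate.
use std::cell::RefCell;
use try_lock::TryLock;

struct Ctx {
    counter: u32,
}

fn assert_send<T: Send>(_: T) {}

async fn bump_refcell(ctx: &RefCell<Ctx>) {
    ctx.borrow_mut().counter += 1;
    std::future::ready(()).await;
}

async fn bump_trylock(ctx: &TryLock<Ctx>) {
    ctx.try_lock().expect("uncontended").counter += 1;
    std::future::ready(()).await;
}

fn demo(cell: &RefCell<Ctx>, lock: &TryLock<Ctx>) {
    // assert_send(bump_refcell(cell)); // rejected: RefCell is !Sync, so &RefCell is !Send
    assert_send(bump_trylock(lock)); // accepted: TryLock<Ctx> is Sync
    let _ = bump_refcell(cell); // still usable on a single thread
}

fn main() {
    demo(&RefCell::new(Ctx { counter: 0 }), &TryLock::new(Ctx { counter: 0 }));
}

The cost of the wrapper is a runtime `try_lock().expect(..)` on every accessor, which is exactly what switching back to plain `&mut` borrows removes.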
View File

@@ -23,7 +23,7 @@ use utils::backoff;
use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT};
use super::{RequestMonitoringInner, LOG_CHAN};
use super::{RequestMonitoring, LOG_CHAN};
#[derive(clap::Args, Clone, Debug)]
pub struct ParquetUploadArgs {
@@ -118,8 +118,8 @@ impl<'a> serde::Serialize for Options<'a> {
}
}
impl From<&RequestMonitoringInner> for RequestData {
fn from(value: &RequestMonitoringInner) -> Self {
impl From<&RequestMonitoring> for RequestData {
fn from(value: &RequestMonitoring) -> Self {
Self {
session_id: value.session_id,
peer_addr: value.peer_addr.to_string(),

View File

@@ -370,7 +370,6 @@ pub struct CancellationRequest {
pub kind: CancellationOutcome,
}
#[derive(Clone, Copy)]
pub enum Waiting {
Cplane,
Client,
@@ -399,6 +398,12 @@ pub struct LatencyTimer {
outcome: ConnectOutcome,
}
pub struct LatencyTimerPause<'a> {
timer: &'a mut LatencyTimer,
start: time::Instant,
waiting_for: Waiting,
}
impl LatencyTimer {
pub fn new(protocol: Protocol) -> Self {
Self {
@@ -412,13 +417,11 @@ impl LatencyTimer {
}
}
pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) {
let dur = start.elapsed();
match waiting_for {
Waiting::Cplane => self.accumulated.cplane += dur,
Waiting::Client => self.accumulated.client += dur,
Waiting::Compute => self.accumulated.compute += dur,
Waiting::RetryTimeout => self.accumulated.retry += dur,
pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
LatencyTimerPause {
timer: self,
start: Instant::now(),
waiting_for,
}
}
@@ -435,6 +438,18 @@ impl LatencyTimer {
}
}
impl Drop for LatencyTimerPause<'_> {
fn drop(&mut self) {
let dur = self.start.elapsed();
match self.waiting_for {
Waiting::Cplane => self.timer.accumulated.cplane += dur,
Waiting::Client => self.timer.accumulated.client += dur,
Waiting::Compute => self.timer.accumulated.compute += dur,
Waiting::RetryTimeout => self.timer.accumulated.retry += dur,
}
}
}
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
pub enum ConnectOutcome {
Success,

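The pause bookkeeping moves into `LatencyTimer` itself: `pause` returns a guard that borrows the timer mutably, and the guard's `Drop` impl adds the elapsed interval to the matching accumulator, replacing the separate `unpause` method. A stripped-down sketch of the same pattern, with a single bucket and illustrative names:

use std::thread::sleep;
use std::time::{Duration, Instant};

#[derive(Default)]
struct Timer {
    client: Duration,
}

struct Pause<'a> {
    timer: &'a mut Timer,
    start: Instant,
}

impl Timer {
    fn pause(&mut self) -> Pause<'_> {
        Pause { timer: self, start: Instant::now() }
    }
}

impl Drop for Pause<'_> {
    fn drop(&mut self) {
        // Accumulate the paused interval when the guard goes out of scope.
        self.timer.client += self.start.elapsed();
    }
}

fn main() {
    let mut timer = Timer::default();
    {
        let _pause = timer.pause(); // waiting on the client
        sleep(Duration::from_millis(5));
    } // guard dropped here, roughly 5ms recorded
    assert!(timer.client >= Duration::from_millis(5));
}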
View File

@@ -113,18 +113,18 @@ pub async fn task_main(
}
};
let ctx = RequestMonitoring::new(
let mut ctx = RequestMonitoring::new(
session_id,
peer_addr,
crate::metrics::Protocol::Tcp,
&config.region,
);
let span = ctx.span();
let span = ctx.span.clone();
let startup = Box::pin(
handle_client(
config,
&ctx,
&mut ctx,
cancellation_handler,
socket,
ClientMode::Tcp,
@@ -240,7 +240,7 @@ impl ReportableError for ClientRequestError {
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
config: &'static ProxyConfig,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
cancellation_handler: Arc<CancellationHandlerMain>,
stream: S,
mode: ClientMode,
@@ -248,25 +248,25 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
conn_gauge: NumClientConnectionsGuard<'static>,
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
info!(
protocol = %ctx.protocol(),
protocol = %ctx.protocol,
"handling interactive connection from client"
);
let metrics = &Metrics::get().proxy;
let proto = ctx.protocol();
let proto = ctx.protocol;
let _request_gauge = metrics.connection_requests.guard(proto);
let tls = config.tls_config.as_ref();
let record_handshake_error = !ctx.has_private_peer_addr();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error);
let (mut stream, params) =
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
return Ok(cancellation_handler
.cancel_session(cancel_key_data, ctx.session_id())
.cancel_session(cancel_key_data, ctx.session_id)
.await
.map(|()| None)?)
}

View File

@@ -46,7 +46,7 @@ pub trait ConnectMechanism {
type Error: From<Self::ConnectError>;
async fn connect_once(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError>;
@@ -58,7 +58,7 @@ pub trait ConnectMechanism {
pub trait ComputeConnectBackend {
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
fn get_keys(&self) -> Option<&ComputeCredentialKeys>;
@@ -81,7 +81,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
async fn connect_once(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
@@ -98,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
/// Try to connect to the compute node, retrying if necessary.
#[tracing::instrument(skip_all)]
pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
mechanism: &M,
user_info: &B,
allow_self_signed_compute: bool,
@@ -126,7 +126,7 @@ where
.await
{
Ok(res) => {
ctx.success();
ctx.latency_timer.success();
Metrics::get().proxy.retries_metric.observe(
RetriesMetricGroup {
outcome: ConnectOutcome::Success,
@@ -178,7 +178,7 @@ where
.await
{
Ok(res) => {
ctx.success();
ctx.latency_timer.success();
Metrics::get().proxy.retries_metric.observe(
RetriesMetricGroup {
outcome: ConnectOutcome::Success,
@@ -209,7 +209,9 @@ where
let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
num_retries += 1;
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout);
let pause = ctx
.latency_timer
.pause(crate::metrics::Waiting::RetryTimeout);
time::sleep(wait_duration).await;
drop(pause);
}

View File

@@ -10,7 +10,6 @@ use tracing::{info, warn};
use crate::{
auth::endpoint_sni,
config::{TlsConfig, PG_ALPN_PROTOCOL},
context::RequestMonitoring,
error::ReportableError,
metrics::Metrics,
proxy::ERR_INSECURE_CONNECTION,
@@ -68,7 +67,6 @@ pub enum HandshakeData<S> {
/// we also take extra care to propagate only selected handshake errors to the client.
#[tracing::instrument(skip_all)]
pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
ctx: &RequestMonitoring,
stream: S,
mut tls: Option<&TlsConfig>,
record_handshake_error: bool,
@@ -82,6 +80,8 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = stream.read_startup_packet().await?;
info!("received {msg:?}");
use FeStartupPacket::*;
match msg {
SslRequest { direct } => match stream.get_ref() {
@@ -145,20 +145,16 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
let conn_info = tls_stream.get_ref().1;
// try parse endpoint
let ep = conn_info
.server_name()
.and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten());
if let Some(ep) = ep {
ctx.set_endpoint_id(ep);
}
// check the ALPN, if exists, as required.
match conn_info.alpn_protocol() {
None | Some(PG_ALPN_PROTOCOL) => {}
Some(other) => {
// try parse ep for better error
let ep = conn_info.server_name().and_then(|sni| {
endpoint_sni(sni, &tls.common_names).ok().flatten()
});
let alpn = String::from_utf8_lossy(other);
warn!(%alpn, "unexpected ALPN");
warn!(?ep, %alpn, "unexpected ALPN");
return Err(HandshakeError::ProtocolViolation);
}
}
@@ -202,12 +198,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
.await?;
}
info!(
?version,
?params,
session_type = "normal",
"successful handshake"
);
info!(?version, session_type = "normal", "successful handshake");
break Ok(HandshakeData::Startup(stream, params));
}
// downgrade protocol version

View File

@@ -155,7 +155,7 @@ impl TestAuth for Scram {
stream: &mut PqStream<Stream<S>>,
) -> anyhow::Result<()> {
let outcome = auth::AuthFlow::new(stream)
.begin(auth::Scram(&self.0, &RequestMonitoring::test()))
.begin(auth::Scram(&self.0, &mut RequestMonitoring::test()))
.await?
.authenticate()
.await?;
@@ -175,11 +175,10 @@ async fn dummy_proxy(
auth: impl TestAuth + Send,
) -> anyhow::Result<()> {
let (client, _) = read_proxy_protocol(client).await?;
let mut stream =
match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? {
HandshakeData::Startup(stream, _) => stream,
HandshakeData::Cancel(_) => bail!("cancellation not supported"),
};
let mut stream = match handshake(client, tls.as_ref(), false).await? {
HandshakeData::Startup(stream, _) => stream,
HandshakeData::Cancel(_) => bail!("cancellation not supported"),
};
auth.authenticate(&mut stream).await?;
@@ -458,7 +457,7 @@ impl ConnectMechanism for TestConnectMechanism {
async fn connect_once(
&self,
_ctx: &RequestMonitoring,
_ctx: &mut RequestMonitoring,
_node_info: &console::CachedNodeInfo,
_timeout: std::time::Duration,
) -> Result<Self::Connection, Self::ConnectError> {
@@ -566,7 +565,7 @@ fn helper_create_connect_info(
async fn connect_to_compute_success() {
let _ = env_logger::try_init();
use ConnectAction::*;
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Wake, Connect]);
let user_info = helper_create_connect_info(&mechanism);
let config = RetryConfig {
@@ -574,7 +573,7 @@ async fn connect_to_compute_success() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap();
mechanism.verify();
@@ -584,7 +583,7 @@ async fn connect_to_compute_success() {
async fn connect_to_compute_retry() {
let _ = env_logger::try_init();
use ConnectAction::*;
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
let user_info = helper_create_connect_info(&mechanism);
let config = RetryConfig {
@@ -592,7 +591,7 @@ async fn connect_to_compute_retry() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap();
mechanism.verify();
@@ -603,7 +602,7 @@ async fn connect_to_compute_retry() {
async fn connect_to_compute_non_retry_1() {
let _ = env_logger::try_init();
use ConnectAction::*;
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]);
let user_info = helper_create_connect_info(&mechanism);
let config = RetryConfig {
@@ -611,7 +610,7 @@ async fn connect_to_compute_non_retry_1() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap_err();
mechanism.verify();
@@ -622,7 +621,7 @@ async fn connect_to_compute_non_retry_1() {
async fn connect_to_compute_non_retry_2() {
let _ = env_logger::try_init();
use ConnectAction::*;
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]);
let user_info = helper_create_connect_info(&mechanism);
let config = RetryConfig {
@@ -630,7 +629,7 @@ async fn connect_to_compute_non_retry_2() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap();
mechanism.verify();
@@ -642,7 +641,7 @@ async fn connect_to_compute_non_retry_3() {
let _ = env_logger::try_init();
tokio::time::pause();
use ConnectAction::*;
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let mechanism =
TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]);
let user_info = helper_create_connect_info(&mechanism);
@@ -657,7 +656,7 @@ async fn connect_to_compute_non_retry_3() {
backoff_factor: 2.0,
};
connect_to_compute(
&ctx,
&mut ctx,
&mechanism,
&user_info,
false,
@@ -674,7 +673,7 @@ async fn connect_to_compute_non_retry_3() {
async fn wake_retry() {
let _ = env_logger::try_init();
use ConnectAction::*;
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]);
let user_info = helper_create_connect_info(&mechanism);
let config = RetryConfig {
@@ -682,7 +681,7 @@ async fn wake_retry() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap();
mechanism.verify();
@@ -693,7 +692,7 @@ async fn wake_retry() {
async fn wake_non_retry() {
let _ = env_logger::try_init();
use ConnectAction::*;
let ctx = RequestMonitoring::test();
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]);
let user_info = helper_create_connect_info(&mechanism);
let config = RetryConfig {
@@ -701,7 +700,7 @@ async fn wake_non_retry() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(&ctx, &mechanism, &user_info, false, config, config)
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap_err();
mechanism.verify();

View File

@@ -34,14 +34,9 @@ async fn proxy_mitm(
tokio::spawn(async move {
// begin handshake with end_server
let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await;
let (end_client, startup) = match handshake(
&RequestMonitoring::test(),
client1,
Some(&server_config1),
false,
)
.await
.unwrap()
let (end_client, startup) = match handshake(client1, Some(&server_config1), false)
.await
.unwrap()
{
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(_) => panic!("cancellation not supported"),

View File

@@ -14,7 +14,7 @@ use super::connect_compute::ComputeConnectBackend;
pub async fn wake_compute<B: ComputeConnectBackend>(
num_retries: &mut u32,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
api: &B,
config: RetryConfig,
) -> Result<CachedNodeInfo, WakeComputeError> {
@@ -52,7 +52,9 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
let wait_duration = retry_after(*num_retries, config);
*num_retries += 1;
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout);
let pause = ctx
.latency_timer
.pause(crate::metrics::Waiting::RetryTimeout);
tokio::time::sleep(wait_duration).await;
drop(pause);
}

View File

@@ -334,7 +334,7 @@ async fn request_handler(
&config.region,
);
let span = ctx.span();
let span = ctx.span.clone();
info!(parent: &span, "performing websocket upgrade");
let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request)
@@ -367,7 +367,7 @@ async fn request_handler(
crate::metrics::Protocol::Http,
&config.region,
);
let span = ctx.span();
let span = ctx.span.clone();
sql_over_http::handle(config, ctx, request, backend, http_cancellation_token)
.instrument(span)

View File

@@ -35,15 +35,15 @@ pub struct PoolingBackend {
impl PoolingBackend {
pub async fn authenticate(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
config: &AuthenticationConfig,
conn_info: &ConnInfo,
) -> Result<ComputeCredentials, AuthError> {
let user_info = conn_info.user_info.clone();
let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone());
let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
return Err(AuthError::ip_address_not_allowed(ctx.peer_addr()));
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(AuthError::ip_address_not_allowed(ctx.peer_addr));
}
if !self
.endpoint_rate_limiter
@@ -100,7 +100,7 @@ impl PoolingBackend {
#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
pub async fn connect_to_compute(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
conn_info: ConnInfo,
keys: ComputeCredentials,
force_new: bool,
@@ -222,7 +222,7 @@ impl ConnectMechanism for TokioMechanism {
async fn connect_once(
&self,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
node_info: &CachedNodeInfo,
timeout: Duration,
) -> Result<Self::Connection, Self::ConnectError> {
@@ -240,7 +240,7 @@ impl ConnectMechanism for TokioMechanism {
.param("client_encoding", "UTF8")
.expect("client encoding UTF8 is always valid");
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let res = config.connect(tokio_postgres::NoTls).await;
drop(pause);
let (client, connection) = permit.release_result(res)?;

View File

@@ -377,7 +377,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
pub fn get(
self: &Arc<Self>,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
conn_info: &ConnInfo,
) -> Result<Option<Client<C>>, HttpConnError> {
let mut client: Option<ClientInner<C>> = None;
@@ -409,9 +409,9 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
"pool: reusing connection '{conn_info}'"
);
client.session.send(ctx.session_id())?;
client.session.send(ctx.session_id)?;
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.success();
ctx.latency_timer.success();
return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
}
}
@@ -465,19 +465,19 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
pub fn poll_client<C: ClientInnerExt>(
global_pool: Arc<GlobalConnPool<C>>,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
conn_info: ConnInfo,
client: C,
mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
conn_id: uuid::Uuid,
aux: MetricsAuxInfo,
) -> Client<C> {
let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
let mut session_id = ctx.session_id();
let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol);
let mut session_id = ctx.session_id;
let (tx, mut rx) = tokio::sync::watch::channel(session_id);
let span = info_span!(parent: None, "connection", %conn_id);
let cold_start_info = ctx.cold_start_info();
let cold_start_info = ctx.cold_start_info;
span.in_scope(|| {
info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
});
@@ -766,6 +766,7 @@ mod tests {
opt_in: false,
max_total_conns: 3,
},
request_timeout: Duration::from_secs(1),
cancel_set: CancelSet::new(0),
client_conn_threshold: u64::MAX,
}));

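When the pool reuses a connection, the new request hands its session id to the long-lived connection task through the `tokio::sync::watch` channel set up in `poll_client` (`client.session.send(ctx.session_id)` above), so the task can attribute its logging to the current session. A reduced sketch of that hand-off, assuming the `tokio` and `uuid` crates and an invented task body:

use tokio::sync::watch;
use uuid::Uuid;

#[tokio::main]
async fn main() {
    let first_session = Uuid::new_v4();
    let (tx, mut rx) = watch::channel(first_session);

    // Long-lived task standing in for the pooled connection driver.
    let driver = tokio::spawn(async move {
        let mut session_id = *rx.borrow();
        while rx.changed().await.is_ok() {
            session_id = *rx.borrow();
            println!("connection now serving session {session_id}");
        }
        session_id
    });

    // A later request reuses the pooled connection and announces itself.
    let second_session = Uuid::new_v4();
    tx.send(second_session).unwrap();
    drop(tx); // closing the sender ends the driver loop
    assert_eq!(driver.await.unwrap(), second_session);
}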
View File

@@ -144,7 +144,7 @@ impl UserFacingError for ConnInfoError {
}
fn get_conn_info(
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
headers: &HeaderMap,
tls: &TlsConfig,
) -> Result<ConnInfo, ConnInfoError> {
@@ -224,12 +224,12 @@ fn get_conn_info(
// TODO: return different http error codes
pub async fn handle(
config: &'static ProxyConfig,
ctx: RequestMonitoring,
mut ctx: RequestMonitoring,
request: Request<Incoming>,
backend: Arc<PoolingBackend>,
cancel: CancellationToken,
) -> Result<Response<Full<Bytes>>, ApiError> {
let result = handle_inner(cancel, config, &ctx, request, backend).await;
let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
let mut response = match result {
Ok(r) => {
@@ -482,16 +482,13 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option<HeaderValue>
async fn handle_inner(
cancel: CancellationToken,
config: &'static ProxyConfig,
ctx: &RequestMonitoring,
ctx: &mut RequestMonitoring,
request: Request<Incoming>,
backend: Arc<PoolingBackend>,
) -> Result<Response<Full<Bytes>>, SqlOverHttpError> {
let _requeset_gauge = Metrics::get()
.proxy
.connection_requests
.guard(ctx.protocol());
let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol);
info!(
protocol = %ctx.protocol(),
protocol = %ctx.protocol,
"handling interactive connection from client"
);
@@ -547,7 +544,7 @@ async fn handle_inner(
.await?;
// not strictly necessary to mark success here,
// but it's just insurance in case we forget it somewhere else
ctx.success();
ctx.latency_timer.success();
Ok::<_, HttpConnError>(client)
}
.map_err(SqlOverHttpError::from),

View File

@@ -129,7 +129,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
pub async fn serve_websocket(
config: &'static ProxyConfig,
ctx: RequestMonitoring,
mut ctx: RequestMonitoring,
websocket: OnUpgrade,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -145,7 +145,7 @@ pub async fn serve_websocket(
let res = Box::pin(handle_client(
config,
&ctx,
&mut ctx,
cancellation_handler,
WebSocketRw::new(websocket),
ClientMode::Websockets { hostname },

View File

@@ -1,7 +1,8 @@
[tool.poetry]
name = "neon"
version = "0.1.0"
description = ""
authors = []
package-mode = false
[tool.poetry.dependencies]
python = "^3.9"
@@ -41,7 +42,6 @@ httpx = {extras = ["http2"], version = "^0.26.0"}
pytest-repeat = "^0.9.3"
websockets = "^12.0"
clickhouse-connect = "^0.7.16"
kafka-python = "^2.0.2"
[tool.poetry.group.dev.dependencies]
mypy = "==1.3.0"
@@ -75,7 +75,6 @@ module = [
"allure.*",
"allure_commons.*",
"allure_pytest.*",
"kafka.*",
]
ignore_missing_imports = true

View File

@@ -170,6 +170,11 @@ struct Args {
/// still needed for existing replication connection.
#[arg(long)]
walsenders_keep_horizon: bool,
/// Enable partial backup. If disabled, safekeeper will not upload partial
/// segments to remote storage.
/// TODO: now partial backup is always enabled, remove this flag.
#[arg(long)]
partial_backup_enabled: bool,
/// Controls how long backup will wait until uploading the partial segment.
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
partial_backup_timeout: Duration,
@@ -342,6 +347,7 @@ async fn main() -> anyhow::Result<()> {
sk_auth_token,
current_thread_runtime: args.current_thread_runtime,
walsenders_keep_horizon: args.walsenders_keep_horizon,
partial_backup_enabled: true,
partial_backup_timeout: args.partial_backup_timeout,
disable_periodic_broker_push: args.disable_periodic_broker_push,
enable_offload: args.enable_offload,

View File

@@ -21,7 +21,6 @@ pub mod json_ctrl;
pub mod metrics;
pub mod patch_control_file;
pub mod pull_timeline;
pub mod rate_limit;
pub mod receive_wal;
pub mod recovery;
pub mod remove_wal;
@@ -54,7 +53,6 @@ pub mod defaults {
pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2;
// By default, our required residency before eviction is the same as the period that passes
// before uploading a partial segment, so that in normal operation the eviction can happen
@@ -93,6 +91,7 @@ pub struct SafeKeeperConf {
pub sk_auth_token: Option<SecretString>,
pub current_thread_runtime: bool,
pub walsenders_keep_horizon: bool,
pub partial_backup_enabled: bool,
pub partial_backup_timeout: Duration,
pub disable_periodic_broker_push: bool,
pub enable_offload: bool,
@@ -136,6 +135,7 @@ impl SafeKeeperConf {
max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
current_thread_runtime: false,
walsenders_keep_horizon: false,
partial_backup_enabled: false,
partial_backup_timeout: Duration::from_secs(0),
disable_periodic_broker_push: false,
enable_offload: false,

View File

@@ -1,49 +0,0 @@
use std::sync::Arc;
use rand::Rng;
use crate::metrics::MISC_OPERATION_SECONDS;
/// Global rate limiter for background tasks.
#[derive(Clone)]
pub struct RateLimiter {
partial_backup: Arc<tokio::sync::Semaphore>,
eviction: Arc<tokio::sync::Semaphore>,
}
impl RateLimiter {
/// Create a new rate limiter.
/// - `partial_backup_max`: maximum number of concurrent partial backups.
/// - `eviction_max`: maximum number of concurrent timeline evictions.
pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self {
Self {
partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)),
eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)),
}
}
/// Get a permit for partial backup. This will block if the maximum number of concurrent
/// partial backups is reached.
pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit {
let _timer = MISC_OPERATION_SECONDS
.with_label_values(&["partial_permit_acquire"])
.start_timer();
self.partial_backup
.clone()
.acquire_owned()
.await
.expect("semaphore is closed")
}
/// Try to get a permit for timeline eviction. This will return None if the maximum number of
/// concurrent timeline evictions is reached.
pub fn try_acquire_eviction(&self) -> Option<tokio::sync::OwnedSemaphorePermit> {
self.eviction.clone().try_acquire_owned().ok()
}
}
/// Generate a random duration that is a fraction of the given duration.
pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration {
let randf64 = rand::thread_rng().gen_range(0.0..1.0);
duration.mul_f64(randf64)
}
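
The deleted rate_limit.rs above pairs permit-counting semaphores with rand_duration jitter. A sketch of how a background task might consume that pattern (assuming tokio 1 and rand 0.8; the loop body and messages are illustrative, not the safekeeper's actual call sites):

use std::sync::Arc;
use std::time::Duration;

use rand::Rng;

/// Random fraction of `duration`, as in the deleted helper above.
fn rand_duration(duration: &Duration) -> Duration {
    duration.mul_f64(rand::thread_rng().gen_range(0.0..1.0))
}

#[tokio::main]
async fn main() {
    // Two evictions may run at once; everything else must wait for a later attempt.
    let eviction_permits = Arc::new(tokio::sync::Semaphore::new(2));

    match eviction_permits.clone().try_acquire_owned() {
        Ok(_permit) => {
            // Hold the permit for the duration of the eviction.
            println!("evicting timeline");
        }
        Err(_) => {
            // No capacity: defer the next attempt by a random fraction of the base
            // interval so that many timelines do not all retry at the same instant.
            let backoff = rand_duration(&Duration::from_secs(900));
            println!("eviction deferred for {backoff:?}");
        }
    }
}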

View File

@@ -25,7 +25,6 @@ use utils::{
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use crate::rate_limit::RateLimiter;
use crate::receive_wal::WalReceivers;
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
@@ -37,7 +36,7 @@ use crate::timeline_guard::ResidenceGuard;
use crate::timeline_manager::{AtomicStatus, ManagerCtl};
use crate::timelines_set::TimelinesSet;
use crate::wal_backup::{self};
use crate::wal_backup_partial::PartialRemoteSegment;
use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter};
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};

View File

@@ -5,6 +5,7 @@
use anyhow::Context;
use camino::Utf8PathBuf;
use remote_storage::RemotePath;
use std::time::Instant;
use tokio::{
fs::File,
io::{AsyncRead, AsyncWriteExt},
@@ -14,7 +15,6 @@ use utils::crashsafe::durable_rename;
use crate::{
metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
rate_limit::rand_duration,
timeline_manager::{Manager, StateSnapshot},
wal_backup,
wal_backup_partial::{self, PartialRemoteSegment},
@@ -50,6 +50,7 @@ impl Manager {
.flush_lsn
.segment_number(self.wal_seg_size)
== self.last_removed_segno + 1
&& self.resident_since.elapsed() >= self.conf.eviction_min_resident
}
/// Evict the timeline to remote storage.
@@ -111,8 +112,7 @@ impl Manager {
return;
}
self.evict_not_before =
tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
self.resident_since = Instant::now();
info!("successfully restored evicted timeline");
}
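
The timeline_eviction.rs hunks above gate eviction on how long the timeline has been resident since its last restore. A compressed sketch of that check (field names follow the diff; the struct itself is hypothetical and stands in for the manager state):

use std::time::{Duration, Instant};

struct EvictionState {
    resident_since: Instant,
    eviction_min_resident: Duration,
}

impl EvictionState {
    /// A timeline is only an eviction candidate once it has stayed resident long enough.
    fn ready_for_eviction(&self, otherwise_ready: bool) -> bool {
        otherwise_ready && self.resident_since.elapsed() >= self.eviction_min_resident
    }

    /// Called right after the timeline is restored from remote storage.
    fn mark_restored(&mut self) {
        self.resident_since = Instant::now();
    }
}

fn main() {
    let mut state = EvictionState {
        resident_since: Instant::now(),
        eviction_min_resident: Duration::from_secs(900),
    };
    state.mark_restored();
    // Just restored, so even an otherwise idle timeline must stay resident for now.
    assert!(!state.ready_for_eviction(true));
    println!("freshly restored timeline is kept resident");
}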

View File

@@ -23,7 +23,6 @@ use utils::lsn::Lsn;
use crate::{
control_file::{FileStorage, Storage},
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
rate_limit::{rand_duration, RateLimiter},
recovery::recovery_main,
remove_wal::calc_horizon_lsn,
safekeeper::Term,
@@ -33,7 +32,7 @@ use crate::{
timeline_guard::{AccessService, GuardId, ResidenceGuard},
timelines_set::{TimelineSetGuard, TimelinesSet},
wal_backup::{self, WalBackupTaskHandle},
wal_backup_partial::{self, PartialRemoteSegment},
wal_backup_partial::{self, PartialRemoteSegment, RateLimiter},
SafeKeeperConf,
};
@@ -186,11 +185,11 @@ pub(crate) struct Manager {
// misc
pub(crate) access_service: AccessService,
pub(crate) global_rate_limiter: RateLimiter,
pub(crate) partial_backup_rate_limiter: RateLimiter,
// Anti-flapping state: we evict timelines eagerly if they are inactive, but should not
// evict them if they go inactive very soon after being restored.
pub(crate) evict_not_before: Instant,
pub(crate) resident_since: std::time::Instant,
}
/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
@@ -203,7 +202,7 @@ pub async fn main_task(
broker_active_set: Arc<TimelinesSet>,
manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
global_rate_limiter: RateLimiter,
partial_backup_rate_limiter: RateLimiter,
) {
tli.set_status(Status::Started);
@@ -221,7 +220,7 @@ pub async fn main_task(
conf,
broker_active_set,
manager_tx,
global_rate_limiter,
partial_backup_rate_limiter,
)
.await;
@@ -255,29 +254,9 @@ pub async fn main_task(
mgr.set_status(Status::UpdatePartialBackup);
mgr.update_partial_backup(&state_snapshot).await;
let now = Instant::now();
if mgr.evict_not_before > now {
// we should wait until evict_not_before
update_next_event(&mut next_event, mgr.evict_not_before);
}
if mgr.conf.enable_offload
&& mgr.evict_not_before <= now
&& mgr.ready_for_eviction(&next_event, &state_snapshot)
{
// check rate limiter and evict timeline if possible
match mgr.global_rate_limiter.try_acquire_eviction() {
Some(_permit) => {
mgr.set_status(Status::EvictTimeline);
mgr.evict_timeline().await;
}
None => {
// we can't evict timeline now, will try again later
mgr.evict_not_before =
Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
update_next_event(&mut next_event, mgr.evict_not_before);
}
}
if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) {
mgr.set_status(Status::EvictTimeline);
mgr.evict_timeline().await;
}
}
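
The loop above defers work by pushing evict_not_before into the future and recording it via update_next_event. A sketch of that bookkeeping, under the assumption that update_next_event keeps the earliest pending deadline (the helper's real body is not shown in this diff):

use std::time::Duration;

use tokio::time::Instant;

/// Keep `next_event` at the earliest of the deadlines seen so far (assumed semantics).
fn update_next_event(next_event: &mut Option<Instant>, candidate: Instant) {
    let earliest = match *next_event {
        Some(current) => candidate.min(current),
        None => candidate,
    };
    *next_event = Some(earliest);
}

#[tokio::main]
async fn main() {
    let mut next_event: Option<Instant> = None;

    let evict_not_before = Instant::now() + Duration::from_millis(30);
    let partial_backup_due = Instant::now() + Duration::from_millis(10);

    update_next_event(&mut next_event, evict_not_before);
    update_next_event(&mut next_event, partial_backup_due);

    if let Some(deadline) = next_event {
        // The manager sleeps until the earliest deadline and re-evaluates then.
        tokio::time::sleep_until(deadline).await;
        println!("woke up at the earliest pending deadline");
    }
}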
@@ -355,10 +334,11 @@ impl Manager {
conf: SafeKeeperConf,
broker_active_set: Arc<TimelinesSet>,
manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
global_rate_limiter: RateLimiter,
partial_backup_rate_limiter: RateLimiter,
) -> Manager {
let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
Manager {
conf,
wal_seg_size: tli.get_wal_seg_size().await,
walsenders: tli.get_walsenders().clone(),
state_version_rx: tli.get_state_version_rx(),
@@ -373,10 +353,8 @@ impl Manager {
partial_backup_uploaded,
access_service: AccessService::new(manager_tx),
tli,
global_rate_limiter,
// to smooth out evictions spike after restart
evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident),
conf,
partial_backup_rate_limiter,
resident_since: std::time::Instant::now(),
}
}
@@ -544,8 +522,8 @@ impl Manager {
/// Spawns partial WAL backup task if needed.
async fn update_partial_backup(&mut self, state: &StateSnapshot) {
// check if WAL backup is enabled and should be started
if !self.conf.is_wal_backup_enabled() {
// check if partial backup is enabled and should be started
if !self.conf.is_wal_backup_enabled() || !self.conf.partial_backup_enabled {
return;
}
@@ -563,7 +541,7 @@ impl Manager {
self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
self.wal_resident_timeline(),
self.conf.clone(),
self.global_rate_limiter.clone(),
self.partial_backup_rate_limiter.clone(),
)));
}

View File

@@ -2,11 +2,10 @@
//! All timelines should always be present in this map, this is done by loading them
//! all from the disk on startup and keeping them in memory.
use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
use crate::rate_limit::RateLimiter;
use crate::safekeeper::ServerInfo;
use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
use crate::timelines_set::TimelinesSet;
use crate::wal_backup_partial::RateLimiter;
use crate::SafeKeeperConf;
use anyhow::{bail, Context, Result};
use camino::Utf8PathBuf;
@@ -32,7 +31,7 @@ struct GlobalTimelinesState {
conf: Option<SafeKeeperConf>,
broker_active_set: Arc<TimelinesSet>,
load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
global_rate_limiter: RateLimiter,
partial_backup_rate_limiter: RateLimiter,
}
// Used to prevent concurrent timeline loading.
@@ -51,7 +50,7 @@ impl GlobalTimelinesState {
(
self.get_conf().clone(),
self.broker_active_set.clone(),
self.global_rate_limiter.clone(),
self.partial_backup_rate_limiter.clone(),
)
}
@@ -86,7 +85,7 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
conf: None,
broker_active_set: Arc::new(TimelinesSet::default()),
load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
global_rate_limiter: RateLimiter::new(1, 1),
partial_backup_rate_limiter: RateLimiter::new(1),
})
});
@@ -100,10 +99,7 @@ impl GlobalTimelines {
// lock, so use explicit block
let tenants_dir = {
let mut state = TIMELINES_STATE.lock().unwrap();
state.global_rate_limiter = RateLimiter::new(
conf.partial_backup_concurrency,
DEFAULT_EVICTION_CONCURRENCY,
);
state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency);
state.conf = Some(conf);
// Iterate through all directories and load tenants for all directories
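
GlobalTimelinesState above lives in a Lazy<Mutex<...>> that is filled in once during init. A stripped-down sketch of that pattern (assuming once_cell 1.x; the fields here are stand-ins, not the real safekeeper types):

use std::collections::HashMap;
use std::sync::Mutex;

use once_cell::sync::Lazy;

#[derive(Default)]
struct GlobalState {
    conf: Option<String>,            // stand-in for SafeKeeperConf
    timelines: HashMap<String, u64>, // stand-in for the timeline map
}

static STATE: Lazy<Mutex<GlobalState>> = Lazy::new(|| Mutex::new(GlobalState::default()));

fn init(conf: String) {
    // Take the lock only for the brief moment the configuration is installed.
    let mut state = STATE.lock().unwrap();
    state.conf = Some(conf);
}

fn main() {
    init("partial_backup_concurrency=5".to_string());
    let state = STATE.lock().unwrap();
    println!("conf: {}", state.conf.as_deref().unwrap_or("<unset>"));
    println!("timelines loaded: {}", state.timelines.len());
}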

View File

@@ -18,6 +18,8 @@
//! This way control file stores information about all potentially existing
//! remote partial segments and can clean them up after uploading a newer version.
use std::sync::Arc;
use camino::Utf8PathBuf;
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
use remote_storage::RemotePath;
@@ -28,7 +30,6 @@ use utils::lsn::Lsn;
use crate::{
metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
rate_limit::{rand_duration, RateLimiter},
safekeeper::Term,
timeline::WalResidentTimeline,
timeline_manager::StateSnapshot,
@@ -36,6 +37,30 @@ use crate::{
SafeKeeperConf,
};
#[derive(Clone)]
pub struct RateLimiter {
semaphore: Arc<tokio::sync::Semaphore>,
}
impl RateLimiter {
pub fn new(permits: usize) -> Self {
Self {
semaphore: Arc::new(tokio::sync::Semaphore::new(permits)),
}
}
async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit {
let _timer = MISC_OPERATION_SECONDS
.with_label_values(&["partial_permit_acquire"])
.start_timer();
self.semaphore
.clone()
.acquire_owned()
.await
.expect("semaphore is closed")
}
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum UploadStatus {
/// Upload is in progress. This status should be used only for garbage collection,
@@ -327,7 +352,6 @@ pub async fn main_task(
) -> Option<PartialRemoteSegment> {
debug!("started");
let await_duration = conf.partial_backup_timeout;
let mut first_iteration = true;
let (_, persistent_state) = tli.get_state().await;
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
@@ -395,15 +419,6 @@ pub async fn main_task(
}
}
// smoothing the load after restart, by sleeping for a random time.
// if this is not the first iteration, we will wait for the full await_duration
let await_duration = if first_iteration {
first_iteration = false;
rand_duration(&await_duration)
} else {
await_duration
};
// fixing the segno and waiting some time to prevent reuploading the same segment too often
let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
let timeout = tokio::time::sleep(await_duration);
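
main_task above pins a segment number and waits out await_duration before uploading, so the same segment is not re-uploaded on every flush. A sketch of that kind of wait (assuming tokio with the time, sync, and macros features; the channel payload and messages are illustrative):

use std::time::Duration;

use tokio::sync::watch;

#[tokio::main]
async fn main() {
    let (flush_lsn_tx, mut flush_lsn_rx) = watch::channel(0u64);
    let await_duration = Duration::from_millis(50);

    // Pretend a WAL writer advances the flush LSN shortly after the wait starts.
    tokio::spawn(async move {
        tokio::time::sleep(Duration::from_millis(10)).await;
        let _ = flush_lsn_tx.send(42);
    });

    let timeout = tokio::time::sleep(await_duration);
    tokio::pin!(timeout);

    tokio::select! {
        _ = &mut timeout => {
            println!("timeout elapsed, the pinned segment is stable enough to upload");
        }
        res = flush_lsn_rx.changed() => {
            if res.is_ok() {
                // New WAL arrived; the real task would re-pin the segment and wait again.
                println!("flush LSN advanced to {}, restarting the wait", *flush_lsn_rx.borrow());
            }
        }
    }
}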
@@ -439,7 +454,7 @@ pub async fn main_task(
}
// limit concurrent uploads
let _upload_permit = limiter.acquire_partial_backup().await;
let _upload_permit = limiter.acquire_owned().await;
let prepared = backup.prepare_upload().await;
if let Some(seg) = &uploaded_segment {

View File

@@ -181,6 +181,7 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
sk_auth_token: None,
current_thread_runtime: false,
walsenders_keep_horizon: false,
partial_backup_enabled: false,
partial_backup_timeout: Duration::from_secs(0),
disable_periodic_broker_push: false,
enable_offload: false,

View File

@@ -642,7 +642,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
logging::replace_panic_hook_with_tracing_panic_hook().forget();
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}");
info!("version: {GIT_VERSION}");
info!("build_tag: {BUILD_TAG}");
metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
// On any shutdown signal, log receival and exit.

View File

@@ -32,7 +32,6 @@ once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
rand.workspace = true
reqwest = { workspace = true, features = ["stream"] }
routerify.workspace = true
serde.workspace = true

View File

@@ -9,14 +9,12 @@ use std::time::Duration;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::chaos_injector::ChaosInjector;
use storage_controller::service::{
Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
RECONCILER_CONCURRENCY_DEFAULT,
};
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
@@ -88,10 +86,6 @@ struct Cli {
// TODO: make `cfg(feature = "testing")`
#[arg(long)]
neon_local_repo_dir: Option<PathBuf>,
/// Chaos testing
#[arg(long)]
chaos_interval: Option<humantime::Duration>,
}
enum StrictMode {
@@ -315,22 +309,6 @@ async fn async_main() -> anyhow::Result<()> {
tracing::info!("Serving on {0}", args.listen);
let server_task = tokio::task::spawn(server);
let chaos_task = args.chaos_interval.map(|interval| {
let service = service.clone();
let cancel = CancellationToken::new();
let cancel_bg = cancel.clone();
(
tokio::task::spawn(
async move {
let mut chaos_injector = ChaosInjector::new(service, interval.into());
chaos_injector.run(cancel_bg).await
}
.instrument(tracing::info_span!("chaos_injector")),
),
cancel,
)
});
// Wait until we receive a signal
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
@@ -359,12 +337,6 @@ async fn async_main() -> anyhow::Result<()> {
}
}
// If we were injecting chaos, stop that so that we're not calling into Service while it shuts down
if let Some((chaos_jh, chaos_cancel)) = chaos_task {
chaos_cancel.cancel();
chaos_jh.await.ok();
}
service.shutdown().await;
tracing::info!("Service shutdown complete");
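
The removed block above spawned the chaos injector under its own tracing span and kept a CancellationToken so it could be stopped before Service::shutdown. A self-contained sketch of that spawn/cancel/await shape (assuming tokio 1, tokio-util 0.7, tracing, and tracing-subscriber; the chaos loop body is illustrative):

use std::time::Duration;

use tokio_util::sync::CancellationToken;
use tracing::Instrument;

async fn chaos_loop(interval: Duration, cancel: CancellationToken) {
    loop {
        tokio::select! {
            _ = cancel.cancelled() => {
                tracing::info!("chaos loop shutting down");
                return;
            }
            _ = tokio::time::sleep(interval) => {
                tracing::info!("injecting chaos");
            }
        }
    }
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();

    let cancel = CancellationToken::new();
    let cancel_bg = cancel.clone();
    let chaos_task = tokio::task::spawn(
        chaos_loop(Duration::from_millis(20), cancel_bg)
            .instrument(tracing::info_span!("chaos_injector")),
    );

    // ... the service would run here ...
    tokio::time::sleep(Duration::from_millis(60)).await;

    // Stop the background task first so it is not calling into the service during shutdown.
    cancel.cancel();
    chaos_task.await.ok();
}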

View File

@@ -84,8 +84,6 @@ use crate::{
};
use serde::{Deserialize, Serialize};
pub mod chaos_injector;
// For operations that should be quick, like attaching a new tenant
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -2954,6 +2952,7 @@ impl Service {
}
// no shard needs to go first/last; the operation should be idempotent
// TODO: it would be great to ensure that all shards return the same error
let mut results = self
.tenant_for_shards(targets, |tenant_shard_id, node| {
futures::FutureExt::boxed(detach_one(
@@ -2972,7 +2971,6 @@ impl Service {
.filter(|(_, res)| res != &any.1)
.collect::<Vec<_>>();
if !mismatching.is_empty() {
// this can be hit by races which should not happen because operation lock on cplane
let matching = results.len() - mismatching.len();
tracing::error!(
matching,
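
The hunk above fans the same detach call out to every shard and then compares each result against the first one. A sketch of that consistency check with hypothetical stand-in types (the real code works with tenant shard IDs and management API errors):

fn main() {
    // Stand-ins for (tenant_shard_id, result of the per-shard operation).
    let results: Vec<(u32, Result<(), String>)> = vec![
        (0, Ok(())),
        (1, Ok(())),
        (2, Err("node unavailable".to_string())),
    ];

    if let Some(first) = results.first().cloned() {
        // The operation is idempotent, so ordering does not matter; only agreement does.
        let mismatching: Vec<_> = results
            .iter()
            .filter(|(_, res)| res != &first.1)
            .collect();

        if !mismatching.is_empty() {
            let matching = results.len() - mismatching.len();
            eprintln!(
                "{matching} shard(s) matched the first result, {} did not: {mismatching:?}",
                mismatching.len()
            );
        }
    }
}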

Some files were not shown because too many files have changed in this diff.