refactor: VirtualFile::open uses AsRef

2026-06-05 06:20:37 +00:00 · 2024-05-30 09:38:45 +09:00
127 changed files with 2244 additions and 4874 deletions
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -24,7 +24,7 @@ jobs:

  actionlint:
    needs: [ check-permissions ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: reviewdog/action-actionlint@v1
@@ -36,15 +36,3 @@ jobs:
          fail_on_error: true
          filter_mode: nofilter
          level: error
-      - run: |
-          PAT='^\s*runs-on:.*-latest'
-          if grep -ERq $PAT .github/workflows
-          then
-            grep -ERl $PAT .github/workflows |\
-            while read -r f
-            do
-              l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
-              echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
-            done
-            exit 1
-          fi
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -44,7 +44,7 @@ jobs:
      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -60,7 +60,7 @@ jobs:
      github.event.action == 'labeled' &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -109,7 +109,7 @@ jobs:
      github.event.action == 'closed' &&
      github.event.pull_request.head.repo.full_name != github.repository

-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -137,7 +137,7 @@ jobs:
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
    env:
      RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    outputs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
      olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -88,7 +88,7 @@ jobs:

  merge-images:
    needs: [ build-image ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    env:
      IMAGE_TAG: ${{ inputs.image-tag }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -35,7 +35,7 @@ jobs:
  cancel-previous-e2e-tests:
    needs: [ check-permissions ]
    if: github.event_name == 'pull_request'
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - name: Cancel previous e2e-tests runs for this PR
@@ -549,7 +549,7 @@ jobs:
  report-benchmarks-failures:
    needs: [ benchmarks, create-test-report ]
    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
    - uses: slackapi/slack-github-action@v1
@@ -774,7 +774,7 @@ jobs:

  neon-image:
    needs: [ neon-image-arch, tag ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - uses: docker/login-action@v3
@@ -884,7 +884,7 @@ jobs:

  compute-node-image:
    needs: [ compute-node-image-arch, tag ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    strategy:
      matrix:
@@ -1032,7 +1032,7 @@ jobs:

  promote-images:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    env:
      VERSIONS: v14 v15 v16
@@ -1077,7 +1077,7 @@ jobs:

  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -19,7 +19,7 @@ permissions: {}

 jobs:
  check-image:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    outputs:
      tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
      found: ${{ steps.check-image.outputs.found }}
--- a/.github/workflows/check-permissions.yml
+++ b/.github/workflows/check-permissions.yml
@@ -16,7 +16,7 @@ permissions: {}

 jobs:
  check-permissions:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
    - name: Disallow CI runs on PRs from forks
      if: |
--- a/.github/workflows/cleanup-caches-by-a-branch.yml
+++ b/.github/workflows/cleanup-caches-by-a-branch.yml
@@ -9,7 +9,7 @@ on:

 jobs:
  cleanup:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - name: Cleanup
        run: |
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -20,7 +20,7 @@ concurrency:
 jobs:
  test-postgres-client-libs:
    # TODO: switch to gen2 runner, requires docker
-    runs-on: ubuntu-22.04
+    runs-on: [ ubuntu-latest ]

    env:
      DEFAULT_PG_VERSION: 14
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -26,7 +26,7 @@ permissions: {}

 jobs:
  tag-image:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    env:
      FROM_TAG: ${{ inputs.from-tag }}
--- a/.github/workflows/release-notify.yml
+++ b/.github/workflows/release-notify.yml
@@ -19,7 +19,7 @@ on:

 jobs:
  notify:
-    runs-on: ubuntu-22.04
+    runs-on: [ ubuntu-latest ]

    steps:
      - uses: neondatabase/dev-actions/release-pr-notify@main
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -26,7 +26,7 @@ defaults:
 jobs:
  create-storage-release-branch:
    if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    permissions:
      contents: write # for `git push`
@@ -65,7 +65,7 @@ jobs:

  create-proxy-release-branch:
    if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    permissions:
      contents: write # for `git push`
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -19,7 +19,7 @@ env:
 jobs:
  cancel-previous-e2e-tests:
    if: github.event_name == 'pull_request'
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - name: Cancel previous e2e-tests runs for this PR
@@ -31,7 +31,7 @@ jobs:
              --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"

  tag:
-    runs-on: ubuntu-22.04
+    runs-on: [ ubuntu-latest ]
    outputs:
      build-tag: ${{ steps.build-tag.outputs.tag }}

@@ -62,7 +62,7 @@ jobs:

  trigger-e2e-tests:
    needs: [ tag ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    env:
      TAG: ${{ needs.tag.outputs.build-tag }}
    steps:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2915,12 +2915,6 @@ version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"

-[[package]]
-name = "linux-raw-sys"
-version = "0.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4"
-
 [[package]]
 name = "lock_api"
 version = "0.4.10"
@@ -3570,7 +3564,6 @@ dependencies = [
 "serde",
 "serde_json",
 "svg_fmt",
- "thiserror",
 "tokio",
 "tokio-util",
 "toml_edit",
@@ -4120,7 +4113,6 @@ version = "0.1.0"
 dependencies = [
 "byteorder",
 "bytes",
- "itertools",
 "pin-project-lite",
 "postgres-protocol",
 "rand 0.8.5",
@@ -5129,7 +5121,6 @@ dependencies = [
 "futures-util",
 "hex",
 "histogram",
- "humantime",
 "itertools",
 "once_cell",
 "pageserver",
@@ -5820,7 +5811,6 @@ dependencies = [
 "anyhow",
 "clap",
 "comfy-table",
- "humantime",
 "hyper 0.14.26",
 "pageserver_api",
 "pageserver_client",
@@ -6166,7 +6156,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
 dependencies = [
 "futures",
 "nix 0.26.4",
@@ -6678,12 +6668,11 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
 dependencies = [
 "bytes",
 "io-uring",
 "libc",
- "linux-raw-sys 0.6.4",
 ]

 [[package]]
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -89,7 +89,7 @@ RUN apt update && \
 # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
-    mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
+    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
    cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /
@@ -98,7 +98,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"

 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
-    mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
+    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
@@ -124,7 +124,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg

 RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
-    mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
+    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -149,7 +149,7 @@ RUN apt update && \

 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
    echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
-    mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
+    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
    # generate and copy upgrade scripts
    mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
    cp upgrade/* /usr/local/pgsql/share/extension/ && \
@@ -194,7 +194,7 @@ RUN case "$(uname -m)" in \

 RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
    echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
-    mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
+    mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
    mkdir build && cd build && \
    cmake .. -DCMAKE_BUILD_TYPE=Release && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -204,7 +204,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz

 RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
    echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
-    mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
+    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -222,7 +222,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
    echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
-    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
+    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    # unit extension's "create extension" script relies on absolute install path to fill some reference tables.
@@ -243,12 +243,12 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY patches/pgvector.patch /pgvector.patch

-# By default, pgvector Makefile uses `-march=native`. We don't want that,
+# By default, pgvector Makefile uses `-march=native`. We don't want that, 
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.1.tar.gz -O pgvector.tar.gz && \
-    echo "fe6c8cb4e0cd1a8cb60f5badf9e1701e0fcabcfc260931c26d01e155c4dd21d1 pgvector.tar.gz" | sha256sum --check && \
-    mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
+    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
+    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    patch -p1 < /pgvector.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -266,7 +266,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
 RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
    echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
-    mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
+    mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control

@@ -281,7 +281,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
    echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
-    mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
+    mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
@@ -297,7 +297,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
    echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
-    mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
+    mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
@@ -313,7 +313,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
    echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
-    mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
+    mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
@@ -329,7 +329,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
    echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
-    mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
+    mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
@@ -345,7 +345,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
-    mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
+    mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
@@ -361,7 +361,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
-    mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
+    mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
@@ -377,7 +377,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
-    mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
+    mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
@@ -393,7 +393,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
    echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
-    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
+    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
@@ -424,7 +424,7 @@ RUN case "${PG_VERSION}" in \
    apt-get install -y cmake && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
-    mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \
+    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -462,7 +462,7 @@ RUN case "${PG_VERSION}" in \
    esac && \
    wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \
    echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \
-    mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
+    mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
@@ -481,7 +481,7 @@ RUN apt-get update && \
    apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
-    mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
+    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
@@ -505,7 +505,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
-    mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
+    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
@@ -531,7 +531,7 @@ RUN apt-get update && \
 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
    echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
-    mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
+    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
    cmake \
        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
        -D RDK_BUILD_INCHI_SUPPORT=ON \
@@ -571,7 +571,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
-    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
+    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
@@ -588,7 +588,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
-    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
+    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
@@ -605,7 +605,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
    echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
-    mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
+    mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
@@ -631,7 +631,7 @@ RUN case "${PG_VERSION}" in \
    esac && \
    wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
    echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
-    mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
+    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install

@@ -647,7 +647,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
-    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
+    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
@@ -696,7 +696,7 @@ ARG PG_VERSION

 RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
    echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
-    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
+    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
@@ -713,7 +713,7 @@ ARG PG_VERSION

 RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
    echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
-    mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
+    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release && \
    # it's needed to enable extension because it uses untrusted C language
@@ -733,7 +733,7 @@ ARG PG_VERSION
 # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
 RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
-    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
+    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

@@ -749,7 +749,7 @@ ARG PG_VERSION

 RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
    echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
-    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
+    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
    echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
    wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
    patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
@@ -771,7 +771,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
    echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
-    mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
+    mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install

@@ -787,7 +787,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
    echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
-    mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
+    mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
@@ -804,7 +804,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
    echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
-    mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
+    mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -9,7 +9,6 @@ license.workspace = true
 anyhow.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
-humantime.workspace = true
 hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -7,9 +7,8 @@ use pageserver_api::{
        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
-        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
-        TenantShardSplitRequest, TenantShardSplitResponse,
+        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
+        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
@@ -126,28 +125,6 @@ enum Command {
        #[arg(long)]
        tenant_id: TenantId,
    },
-    /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
-    /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
-    TenantDrop {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        unclean: bool,
-    },
-    NodeDrop {
-        #[arg(long)]
-        node_id: NodeId,
-        #[arg(long)]
-        unclean: bool,
-    },
-    TenantSetTimeBasedEviction {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        period: humantime::Duration,
-        #[arg(long)]
-        threshold: humantime::Duration,
-    },
 }

 #[derive(Parser)]
@@ -697,46 +674,6 @@ async fn main() -> anyhow::Result<()> {
                }
            }
        }
-        Command::TenantDrop { tenant_id, unclean } => {
-            if !unclean {
-                anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant.  If you know what you're doing, add `--unclean` to proceed.")
-            }
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::POST,
-                    format!("debug/v1/tenant/{tenant_id}/drop"),
-                    None,
-                )
-                .await?;
-        }
-        Command::NodeDrop { node_id, unclean } => {
-            if !unclean {
-                anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it.  If you know what you're doing, add `--unclean` to proceed.")
-            }
-            storcon_client
-                .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
-                .await?;
-        }
-        Command::TenantSetTimeBasedEviction {
-            tenant_id,
-            period,
-            threshold,
-        } => {
-            vps_client
-                .tenant_config(&TenantConfigRequest {
-                    tenant_id,
-                    config: TenantConfig {
-                        eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
-                            EvictionPolicyLayerAccessThreshold {
-                                period: period.into(),
-                                threshold: threshold.into(),
-                            },
-                        )),
-                        ..Default::default()
-                    },
-                })
-                .await?;
-        }
    }

    Ok(())
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,7 +1,7 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi::RepOriginId;
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::{fmt, ops::Range};
@@ -39,9 +39,6 @@ pub const RELATION_SIZE_PREFIX: u8 = 0x61;
 /// The key prefix of AUX file keys.
 pub const AUX_KEY_PREFIX: u8 = 0x62;

-/// The key prefix of ReplOrigin keys.
-pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;
-
 /// Check if the key falls in the range of metadata keys.
 pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
    key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
@@ -56,8 +53,14 @@ impl Key {
    /// Encode a metadata key to a storage key.
    pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
        assert!(is_metadata_key_slice(key), "key not in metadata key range");
-        // Metadata key space ends at 0x7F so it's fine to directly convert it to i128.
-        Self::from_i128(i128::from_be_bytes(*key))
+        Key {
+            field1: key[0],
+            field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
+            field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
+            field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
+            field5: key[11],
+            field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
+        }
    }

    /// Encode a metadata key to a storage key.
@@ -65,6 +68,17 @@ impl Key {
        Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
    }

+    /// Write this key to `writer` as a metadata key: exactly 16 bytes, integer fields big-endian. Panics if `field2` does not fit in 16 bits.
+    pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
+        writer.put_u8(self.field1);
+        assert!(self.field2 <= 0xFFFF);
+        writer.put_u16(self.field2 as u16);
+        writer.put_u32(self.field3);
+        writer.put_u32(self.field4);
+        writer.put_u8(self.field5);
+        writer.put_u32(self.field6);
+    }
+
    /// Get the range of metadata keys.
    pub const fn metadata_key_range() -> Range<Self> {
        Key {
@@ -107,7 +121,7 @@ impl Key {
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
    pub fn to_i128(&self) -> i128 {
-        assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
+        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
        (((self.field1 & 0x7F) as i128) << 120)
            | (((self.field2 & 0xFFFF) as i128) << 104)
            | ((self.field3 as i128) << 72)
@@ -161,7 +175,7 @@ impl Key {
    }

    /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
-    /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
+    /// Use [`Key::from_metadata_key`] instead.
    pub fn from_slice(b: &[u8]) -> Self {
        Key {
            field1: b[0],
@@ -174,7 +188,7 @@ impl Key {
    }

    /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
-    /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
+    /// Use [`Key::extract_metadata_key_to_writer`] instead.
    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
        buf[0] = self.field1;
        BE::write_u32(&mut buf[1..5], self.field2);
@@ -385,14 +399,7 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
        field3: rel.dbnode,
        field4: rel.relnode,
        field5: rel.forknum,
-        field6: 0xffff_ffff,
-    }
-}
-
-impl Key {
-    #[inline(always)]
-    pub fn is_rel_size_key(&self) -> bool {
-        self.field1 == 0 && self.field6 == u32::MAX
+        field6: 0xffffffff,
    }
 }

@@ -433,25 +440,6 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key {
    }
 }

-#[inline(always)]
-pub fn slru_dir_kind(key: &Key) -> Option<Result<SlruKind, u32>> {
-    if key.field1 == 0x01
-        && key.field3 == 0
-        && key.field4 == 0
-        && key.field5 == 0
-        && key.field6 == 0
-    {
-        match key.field2 {
-            0 => Some(Ok(SlruKind::Clog)),
-            1 => Some(Ok(SlruKind::MultiXactMembers)),
-            2 => Some(Ok(SlruKind::MultiXactOffsets)),
-            x => Some(Err(x)),
-        }
-    } else {
-        None
-    }
-}
-
 #[inline(always)]
 pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
    Key {
@@ -480,17 +468,7 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
        field3: 1,
        field4: segno,
        field5: 0,
-        field6: 0xffff_ffff,
-    }
-}
-
-impl Key {
-    pub fn is_slru_segment_size_key(&self) -> bool {
-        self.field1 == 0x01
-            && self.field2 < 0x03
-            && self.field3 == 0x01
-            && self.field5 == 0
-            && self.field6 == u32::MAX
+        field6: 0xffffffff,
    }
 }

@@ -591,37 +569,6 @@ pub const AUX_FILES_KEY: Key = Key {
    field6: 2,
 };

-#[inline(always)]
-pub fn repl_origin_key(origin_id: RepOriginId) -> Key {
-    Key {
-        field1: REPL_ORIGIN_KEY_PREFIX,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: origin_id as u32,
-    }
-}
-
-/// Get the range of replorigin keys.
-pub fn repl_origin_key_range() -> Range<Key> {
-    Key {
-        field1: REPL_ORIGIN_KEY_PREFIX,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: REPL_ORIGIN_KEY_PREFIX,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0x10000,
-    }
-}
-
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

@@ -630,78 +577,73 @@ pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
 /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
 pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();

-impl Key {
-    // AUX_FILES currently stores only data for logical replication (slots etc), and
-    // we don't preserve these on a branch because safekeepers can't follow timeline
-    // switch (and generally it likely should be optional), so ignore these.
-    #[inline(always)]
-    pub fn is_inherited_key(self) -> bool {
-        !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
-    }
+// AUX_FILES currently stores only data for logical replication (slots etc), and
+// we don't preserve these on a branch because safekeepers can't follow timeline
+// switch (and generally it likely should be optional), so ignore these.
+#[inline(always)]
+pub fn is_inherited_key(key: Key) -> bool {
+    !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
+}

-    #[inline(always)]
-    pub fn is_rel_fsm_block_key(self) -> bool {
-        self.field1 == 0x00
-            && self.field4 != 0
-            && self.field5 == FSM_FORKNUM
-            && self.field6 != 0xffffffff
-    }
+#[inline(always)]
+pub fn is_rel_fsm_block_key(key: Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
+}

-    #[inline(always)]
-    pub fn is_rel_vm_block_key(self) -> bool {
-        self.field1 == 0x00
-            && self.field4 != 0
-            && self.field5 == VISIBILITYMAP_FORKNUM
-            && self.field6 != 0xffffffff
-    }
+#[inline(always)]
+pub fn is_rel_vm_block_key(key: Key) -> bool {
+    key.field1 == 0x00
+        && key.field4 != 0
+        && key.field5 == VISIBILITYMAP_FORKNUM
+        && key.field6 != 0xffffffff
+}

-    #[inline(always)]
-    pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
-        Ok(match self.field1 {
-            0x01 => {
-                let kind = match self.field2 {
-                    0x00 => SlruKind::Clog,
-                    0x01 => SlruKind::MultiXactMembers,
-                    0x02 => SlruKind::MultiXactOffsets,
-                    _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2),
-                };
-                let segno = self.field4;
-                let blknum = self.field6;
+#[inline(always)]
+pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
+    Ok(match key.field1 {
+        0x01 => {
+            let kind = match key.field2 {
+                0x00 => SlruKind::Clog,
+                0x01 => SlruKind::MultiXactMembers,
+                0x02 => SlruKind::MultiXactOffsets,
+                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
+            };
+            let segno = key.field4;
+            let blknum = key.field6;

-                (kind, segno, blknum)
-            }
-            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
-        })
-    }
+            (kind, segno, blknum)
+        }
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
+    })
+}

-    #[inline(always)]
-    pub fn is_slru_block_key(self) -> bool {
-        self.field1 == 0x01                // SLRU-related
-        && self.field3 == 0x00000001   // but not SlruDir
-        && self.field6 != 0xffffffff // and not SlruSegSize
-    }
+#[inline(always)]
+pub fn is_slru_block_key(key: Key) -> bool {
+    key.field1 == 0x01                // SLRU-related
+        && key.field3 == 0x00000001   // but not SlruDir
+        && key.field6 != 0xffffffff // and not SlruSegSize
+}

-    #[inline(always)]
-    pub fn is_rel_block_key(&self) -> bool {
-        self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
-    }
+#[inline(always)]
+pub fn is_rel_block_key(key: &Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
+}

-    /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
-    #[inline(always)]
-    pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
-        Ok(match self.field1 {
-            0x00 => (
-                RelTag {
-                    spcnode: self.field2,
-                    dbnode: self.field3,
-                    relnode: self.field4,
-                    forknum: self.field5,
-                },
-                self.field6,
-            ),
-            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
-        })
-    }
+/// Guaranteed to return `Ok()` if [`is_rel_block_key`] returns `true` for `key`.
+#[inline(always)]
+pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
+    Ok(match key.field1 {
+        0x00 => (
+            RelTag {
+                spcnode: key.field2,
+                dbnode: key.field3,
+                relnode: key.field4,
+                forknum: key.field5,
+            },
+            key.field6,
+        ),
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
+    })
 }

 impl std::str::FromStr for Key {
@@ -745,15 +687,10 @@ mod tests {
        let mut metadata_key = vec![AUX_KEY_PREFIX];
        metadata_key.extend_from_slice(&[0xFF; 15]);
        let encoded_key = Key::from_metadata_key(&metadata_key);
-        let output_key = encoded_key.to_i128().to_be_bytes();
+        let mut output_key = Vec::new();
+        encoded_key.extract_metadata_key_to_writer(&mut output_key);
        assert_eq!(metadata_key, output_key);
        assert!(encoded_key.is_metadata_key());
        assert!(is_metadata_key_slice(&metadata_key));
    }
-
-    #[test]
-    fn test_possible_largest_key() {
-        Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF);
-        // TODO: put this key into the system and see if anything breaks.
-    }
 }
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -3,7 +3,7 @@ use std::cmp::Ordering;
 use std::fmt;

 use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
-use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM};
+use postgres_ffi::relfile_utils::forknumber_to_name;
 use postgres_ffi::Oid;

 ///
@@ -68,57 +68,6 @@ impl fmt::Display for RelTag {
    }
 }

-#[derive(Debug, thiserror::Error)]
-pub enum ParseRelTagError {
-    #[error("invalid forknum")]
-    InvalidForknum(#[source] std::num::ParseIntError),
-    #[error("missing triplet member {}", .0)]
-    MissingTripletMember(usize),
-    #[error("invalid triplet member {}", .0)]
-    InvalidTripletMember(usize, #[source] std::num::ParseIntError),
-}
-
-impl std::str::FromStr for RelTag {
-    type Err = ParseRelTagError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        use ParseRelTagError::*;
-
-        // FIXME: in postgres logs this separator is dot
-        // Example:
-        //     could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0
-        // with a regex we could get this more painlessly
-        let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) {
-            Some((t, f)) => {
-                let forknum = forkname_to_number(Some(f));
-                let forknum = if let Ok(f) = forknum {
-                    f
-                } else {
-                    f.parse::<u8>().map_err(InvalidForknum)?
-                };
-
-                (t, Some(forknum))
-            }
-            None => (s, None),
-        };
-
-        let mut split = triplet
-            .splitn(3, '/')
-            .enumerate()
-            .map(|(i, s)| s.parse::<u32>().map_err(|e| InvalidTripletMember(i, e)));
-        let spcnode = split.next().ok_or(MissingTripletMember(0))??;
-        let dbnode = split.next().ok_or(MissingTripletMember(1))??;
-        let relnode = split.next().ok_or(MissingTripletMember(2))??;
-
-        Ok(RelTag {
-            spcnode,
-            forknum: forknum.unwrap_or(MAIN_FORKNUM),
-            dbnode,
-            relnode,
-        })
-    }
-}
-
 impl RelTag {
    pub fn to_segfile_name(&self, segno: u32) -> String {
        let mut name = if self.spcnode == GLOBALTABLESPACE_OID {
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,6 +1,9 @@
 use std::{ops::RangeInclusive, str::FromStr};

-use crate::{key::Key, models::ShardParameters};
+use crate::{
+    key::{is_rel_block_key, Key},
+    models::ShardParameters,
+};
 use hex::FromHex;
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
@@ -425,12 +428,6 @@ impl<'de> Deserialize<'de> for TenantShardId {
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);

-impl Default for ShardStripeSize {
-    fn default() -> Self {
-        DEFAULT_STRIPE_SIZE
-    }
-}
-
 /// Layout version: for future upgrades where we might change how the key->shard mapping works
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardLayout(u8);
@@ -669,7 +666,7 @@ fn key_is_shard0(key: &Key) -> bool {
    // because they must be included in basebackups.
    let is_initfork = key.field5 == INIT_FORKNUM;

-    !key.is_rel_block_key() || is_initfork
+    !is_rel_block_key(key) || is_initfork
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
@@ -716,25 +713,6 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
    ShardNumber((hash % count.0 as u32) as u8)
 }

-/// For debugging, while not exposing the internals.
-#[derive(Debug)]
-#[allow(unused)] // used by debug formatting by pagectl
-struct KeyShardingInfo {
-    shard0: bool,
-    shard_number: ShardNumber,
-}
-
-pub fn describe(
-    key: &Key,
-    shard_count: ShardCount,
-    stripe_size: ShardStripeSize,
-) -> impl std::fmt::Debug {
-    KeyShardingInfo {
-        shard0: key_is_shard0(key),
-        shard_number: key_to_shard_number(shard_count, stripe_size, key),
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use utils::Hex;
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -126,7 +126,6 @@ fn main() -> anyhow::Result<()> {
            .allowlist_type("PageHeaderData")
            .allowlist_type("DBState")
            .allowlist_type("RelMapFile")
-            .allowlist_type("RepOriginId")
            // Because structs are used for serialization, tell bindgen to emit
            // explicit padding fields.
            .explicit_padding(true)
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -110,7 +110,6 @@ pub mod pg_constants;
 pub mod relfile_utils;

 // Export some widely used datatypes that are unlikely to change across Postgres versions
-pub use v14::bindings::RepOriginId;
 pub use v14::bindings::{uint32, uint64, Oid};
 pub use v14::bindings::{BlockNumber, OffsetNumber};
 pub use v14::bindings::{MultiXactId, TransactionId};
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -102,7 +102,7 @@ pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
 pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
 pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
 pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
-pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
+// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
 // pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
 // pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;

@@ -167,7 +167,6 @@ pub const RM_RELMAP_ID: u8 = 7;
 pub const RM_STANDBY_ID: u8 = 8;
 pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
-pub const RM_REPLORIGIN_ID: u8 = 19;
 pub const RM_LOGICALMSG_ID: u8 = 21;

 // from neon_rmgr.h
@@ -224,10 +223,6 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

-/* From xlog.h */
-pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
-pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;
-
 /* From replication/slot.h */
 pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
   + 64 /* NameData */  + 4*4;
@@ -242,9 +237,6 @@ pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;
 pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
    (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)

-/* From origin.c */
-pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
-
 // List of subdirectories inside pgdata.
 // Copied from src/bin/initdb/initdb.c
 pub const PGDATA_SUBDIRS: [&str; 22] = [
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -7,7 +7,6 @@ license.workspace = true
 [dependencies]
 bytes.workspace = true
 byteorder.workspace = true
-itertools.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -7,9 +7,8 @@ pub mod framed;

 use byteorder::{BigEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
-use itertools::Itertools;
 use serde::{Deserialize, Serialize};
-use std::{borrow::Cow, fmt, io, str};
+use std::{borrow::Cow, collections::HashMap, fmt, io, str};

 // re-export for use in utils pageserver_feedback.rs
 pub use postgres_protocol::PG_EPOCH;
@@ -51,37 +50,15 @@ pub enum FeStartupPacket {
    },
 }

-#[derive(Debug, Clone, Default)]
-pub struct StartupMessageParamsBuilder {
-    params: BytesMut,
-}
-
-impl StartupMessageParamsBuilder {
-    /// Set parameter's value by its name.
-    /// name and value must not contain a \0 byte
-    pub fn insert(&mut self, name: &str, value: &str) {
-        self.params.put(name.as_bytes());
-        self.params.put(&b"\0"[..]);
-        self.params.put(value.as_bytes());
-        self.params.put(&b"\0"[..]);
-    }
-
-    pub fn freeze(self) -> StartupMessageParams {
-        StartupMessageParams {
-            params: self.params.freeze(),
-        }
-    }
-}
-
-#[derive(Debug, Clone, Default)]
+#[derive(Debug)]
 pub struct StartupMessageParams {
-    params: Bytes,
+    params: HashMap<String, String>,
 }

 impl StartupMessageParams {
    /// Get parameter's value by its name.
    pub fn get(&self, name: &str) -> Option<&str> {
-        self.iter().find_map(|(k, v)| (k == name).then_some(v))
+        self.params.get(name).map(|s| s.as_str())
    }

    /// Split command-line options according to PostgreSQL's logic,
@@ -135,19 +112,15 @@ impl StartupMessageParams {

    /// Iterate through key-value pairs in an arbitrary order.
    pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
-        let params =
-            std::str::from_utf8(&self.params).expect("should be validated as utf8 already");
-        params.split_terminator('\0').tuples()
+        self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
    }

    // This function is mostly useful in tests.
    #[doc(hidden)]
    pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
-        let mut b = StartupMessageParamsBuilder::default();
-        for (k, v) in pairs {
-            b.insert(k, v)
+        Self {
+            params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
        }
-        b.freeze()
    }
 }

@@ -372,21 +345,35 @@ impl FeStartupPacket {
            (major_version, minor_version) => {
                // StartupMessage

-                let s = str::from_utf8(&msg).map_err(|_e| {
-                    ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
-                })?;
-                let s = s.strip_suffix('\0').ok_or_else(|| {
-                    ProtocolError::Protocol(
-                        "StartupMessage params: missing null terminator".to_string(),
-                    )
-                })?;
+                // Parse pairs of null-terminated strings (key, value).
+                // See `postgres: ProcessStartupPacket, build_startup_packet`.
+                let mut tokens = str::from_utf8(&msg)
+                    .map_err(|_e| {
+                        ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
+                    })?
+                    .strip_suffix('\0') // drop packet's own null
+                    .ok_or_else(|| {
+                        ProtocolError::Protocol(
+                            "StartupMessage params: missing null terminator".to_string(),
+                        )
+                    })?
+                    .split_terminator('\0');
+
+                let mut params = HashMap::new();
+                while let Some(name) = tokens.next() {
+                    let value = tokens.next().ok_or_else(|| {
+                        ProtocolError::Protocol(
+                            "StartupMessage params: key without value".to_string(),
+                        )
+                    })?;
+
+                    params.insert(name.to_owned(), value.to_owned());
+                }

                FeStartupPacket::StartupMessage {
                    major_version,
                    minor_version,
-                    params: StartupMessageParams {
-                        params: msg.slice_ref(s.as_bytes()),
-                    },
+                    params: StartupMessageParams { params },
                }
            }
        };
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -26,14 +26,13 @@ use futures::stream::Stream;
 use futures_util::StreamExt;
 use futures_util::TryStreamExt;
 use http_types::{StatusCode, Url};
-use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;

-use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
 use crate::{
-    error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
-    ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
+    error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
+    DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+    TimeTravelError, TimeoutOrCancel,
 };

 pub struct AzureBlobStorage {
@@ -138,8 +137,6 @@ impl AzureBlobStorage {
        let mut last_modified = None;
        let mut metadata = HashMap::new();

-        let started_at = start_measuring_requests(kind);
-
        let download = async {
            let response = builder
                // convert to concrete Pageable
@@ -203,22 +200,13 @@ impl AzureBlobStorage {
            })
        };

-        let download = tokio::select! {
+        tokio::select! {
            bufs = download => bufs,
            cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
-                TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout),
-                TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
+                TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
+                TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
            },
-        };
-        let started_at = ScopeGuard::into_inner(started_at);
-        let outcome = match &download {
-            Ok(_) => AttemptOutcome::Ok,
-            Err(_) => AttemptOutcome::Err,
-        };
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, outcome, started_at);
-        download
+        }
    }

    async fn permit(
@@ -352,10 +340,7 @@ impl RemoteStorage for AzureBlobStorage {
        metadata: Option<StorageMetadata>,
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-        let kind = RequestKind::Put;
-        let _permit = self.permit(kind, cancel).await?;
-
-        let started_at = start_measuring_requests(kind);
+        let _permit = self.permit(RequestKind::Put, cancel).await?;

        let op = async {
            let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -379,25 +364,14 @@ impl RemoteStorage for AzureBlobStorage {
            match fut.await {
                Ok(Ok(_response)) => Ok(()),
                Ok(Err(azure)) => Err(azure.into()),
-                Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
+                Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
            }
        };

-        let res = tokio::select! {
+        tokio::select! {
            res = op => res,
-            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
-        };
-
-        let outcome = match res {
-            Ok(_) => AttemptOutcome::Ok,
-            Err(_) => AttemptOutcome::Err,
-        };
-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, outcome, started_at);
-
-        res
+            _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
+        }
    }

    async fn download(
@@ -443,13 +417,12 @@ impl RemoteStorage for AzureBlobStorage {
        paths: &'a [RemotePath],
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-        let kind = RequestKind::Delete;
-        let _permit = self.permit(kind, cancel).await?;
-        let started_at = start_measuring_requests(kind);
+        let _permit = self.permit(RequestKind::Delete, cancel).await?;

        let op = async {
-            // TODO batch requests are not supported by the SDK
+            // TODO batch requests are also not supported by the SDK
            // https://github.com/Azure/azure-sdk-for-rust/issues/1068
+            // https://github.com/Azure/azure-sdk-for-rust/issues/1249
            for path in paths {
                let blob_client = self.client.blob_client(self.relative_path_to_name(path));

@@ -474,16 +447,10 @@ impl RemoteStorage for AzureBlobStorage {
            Ok(())
        };

-        let res = tokio::select! {
+        tokio::select! {
            res = op => res,
-            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
-        };
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-        res
+            _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
+        }
    }

    async fn copy(
@@ -492,9 +459,7 @@ impl RemoteStorage for AzureBlobStorage {
        to: &RemotePath,
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-        let kind = RequestKind::Copy;
-        let _permit = self.permit(kind, cancel).await?;
-        let started_at = start_measuring_requests(kind);
+        let _permit = self.permit(RequestKind::Copy, cancel).await?;

        let timeout = tokio::time::sleep(self.timeout);

@@ -538,21 +503,15 @@ impl RemoteStorage for AzureBlobStorage {
            }
        };

-        let res = tokio::select! {
+        tokio::select! {
            res = op => res,
-            _ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
+            _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
            _ = timeout => {
                let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
                let e = e.context(format!("Timeout, last status: {copy_status:?}"));
                Err(e)
            },
-        };
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-        res
+        }
    }

    async fn time_travel_recover(
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -12,7 +12,6 @@
 mod azure_blob;
 mod error;
 mod local_fs;
-mod metrics;
 mod s3_bucket;
 mod simulate_failures;
 mod support;
@@ -122,8 +121,8 @@ impl RemotePath {
        self.0.file_name()
    }

-    pub fn join(&self, path: impl AsRef<Utf8Path>) -> Self {
-        Self(self.0.join(path))
+    pub fn join(&self, segment: &Utf8Path) -> Self {
+        Self(self.0.join(segment))
    }

    pub fn get_path(&self) -> &Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -46,16 +46,15 @@ use utils::backoff;

 use super::StorageMetadata;
 use crate::{
-    error::Cancelled,
-    metrics::{start_counting_cancelled_wait, start_measuring_requests},
-    support::PermitCarrying,
-    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
-    S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
+    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
+    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

-use crate::metrics::AttemptOutcome;
-pub(super) use crate::metrics::RequestKind;
+pub(super) mod metrics;
+
+use self::metrics::AttemptOutcome;
+pub(super) use self::metrics::RequestKind;

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -228,7 +227,7 @@ impl S3Bucket {
        };

        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
+        metrics::BUCKET_METRICS
            .wait_seconds
            .observe_elapsed(kind, started_at);

@@ -249,7 +248,7 @@ impl S3Bucket {
        };

        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
+        metrics::BUCKET_METRICS
            .wait_seconds
            .observe_elapsed(kind, started_at);
        Ok(permit)
@@ -288,7 +287,7 @@ impl S3Bucket {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                    kind,
                    AttemptOutcome::Ok,
                    started_at,
@@ -296,7 +295,7 @@ impl S3Bucket {
                return Err(DownloadError::NotFound);
            }
            Err(e) => {
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                    kind,
                    AttemptOutcome::Err,
                    started_at,
@@ -372,12 +371,12 @@ impl S3Bucket {
            };

            let started_at = ScopeGuard::into_inner(started_at);
-            crate::metrics::BUCKET_METRICS
+            metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, &resp, started_at);

            let resp = resp.context("request deletion")?;
-            crate::metrics::BUCKET_METRICS
+            metrics::BUCKET_METRICS
                .deleted_objects_total
                .inc_by(chunk.len() as u64);

@@ -436,14 +435,14 @@ pin_project_lite::pin_project! {
    /// Times and tracks the outcome of the request.
    struct TimedDownload<S> {
        started_at: std::time::Instant,
-        outcome: AttemptOutcome,
+        outcome: metrics::AttemptOutcome,
        #[pin]
        inner: S
    }

    impl<S> PinnedDrop for TimedDownload<S> {
        fn drop(mut this: Pin<&mut Self>) {
-            crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
+            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
        }
    }
 }
@@ -452,7 +451,7 @@ impl<S> TimedDownload<S> {
    fn new(started_at: std::time::Instant, inner: S) -> Self {
        TimedDownload {
            started_at,
-            outcome: AttemptOutcome::Cancelled,
+            outcome: metrics::AttemptOutcome::Cancelled,
            inner,
        }
    }
@@ -469,8 +468,8 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
        let res = ready!(this.inner.poll_next(cx));
        match &res {
            Some(Ok(_)) => {}
-            Some(Err(_)) => *this.outcome = AttemptOutcome::Err,
-            None => *this.outcome = AttemptOutcome::Ok,
+            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
+            None => *this.outcome = metrics::AttemptOutcome::Ok,
        }

        Poll::Ready(res)
@@ -544,7 +543,7 @@ impl RemoteStorage for S3Bucket {

            let started_at = ScopeGuard::into_inner(started_at);

-            crate::metrics::BUCKET_METRICS
+            metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, &response, started_at);

@@ -626,7 +625,7 @@ impl RemoteStorage for S3Bucket {
        if let Ok(inner) = &res {
            // do not incl. timeouts as errors in metrics but cancellations
            let started_at = ScopeGuard::into_inner(started_at);
-            crate::metrics::BUCKET_METRICS
+            metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, inner, started_at);
        }
@@ -674,7 +673,7 @@ impl RemoteStorage for S3Bucket {
        };

        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
+        metrics::BUCKET_METRICS
            .req_seconds
            .observe_elapsed(kind, &res, started_at);

@@ -978,6 +977,28 @@ impl RemoteStorage for S3Bucket {
    }
 }

+/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
+fn start_counting_cancelled_wait(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
+        metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
+    })
+}
+
+/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
+fn start_measuring_requests(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
+        metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+            kind,
+            AttemptOutcome::Cancelled,
+            started_at,
+        )
+    })
+}
+
 // Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
 struct VerOrDelete {
    kind: VerOrDeleteKind,
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -15,7 +15,6 @@ pub(crate) enum RequestKind {
    TimeTravel = 5,
 }

-use scopeguard::ScopeGuard;
 use RequestKind::*;

 impl RequestKind {
@@ -34,10 +33,10 @@ impl RequestKind {
    }
 }

-pub(crate) struct RequestTyped<C>([C; 6]);
+pub(super) struct RequestTyped<C>([C; 6]);

 impl<C> RequestTyped<C> {
-    pub(crate) fn get(&self, kind: RequestKind) -> &C {
+    pub(super) fn get(&self, kind: RequestKind) -> &C {
        &self.0[kind.as_index()]
    }

@@ -59,19 +58,19 @@ impl<C> RequestTyped<C> {
 }

 impl RequestTyped<Histogram> {
-    pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
+    pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
        self.get(kind).observe(started_at.elapsed().as_secs_f64())
    }
 }

-pub(crate) struct PassFailCancelledRequestTyped<C> {
+pub(super) struct PassFailCancelledRequestTyped<C> {
    success: RequestTyped<C>,
    fail: RequestTyped<C>,
    cancelled: RequestTyped<C>,
 }

 #[derive(Debug, Clone, Copy)]
-pub(crate) enum AttemptOutcome {
+pub(super) enum AttemptOutcome {
    Ok,
    Err,
    Cancelled,
@@ -87,7 +86,7 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
 }

 impl AttemptOutcome {
-    pub(crate) fn as_str(&self) -> &'static str {
+    pub(super) fn as_str(&self) -> &'static str {
        match self {
            AttemptOutcome::Ok => "ok",
            AttemptOutcome::Err => "err",
@@ -97,7 +96,7 @@ impl AttemptOutcome {
 }

 impl<C> PassFailCancelledRequestTyped<C> {
-    pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
+    pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
        let target = match outcome {
            AttemptOutcome::Ok => &self.success,
            AttemptOutcome::Err => &self.fail,
@@ -120,7 +119,7 @@ impl<C> PassFailCancelledRequestTyped<C> {
 }

 impl PassFailCancelledRequestTyped<Histogram> {
-    pub(crate) fn observe_elapsed(
+    pub(super) fn observe_elapsed(
        &self,
        kind: RequestKind,
        outcome: impl Into<AttemptOutcome>,
@@ -131,44 +130,19 @@ impl PassFailCancelledRequestTyped<Histogram> {
    }
 }

-/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
-pub(crate) fn start_counting_cancelled_wait(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
-        crate::metrics::BUCKET_METRICS
-            .cancelled_waits
-            .get(kind)
-            .inc()
-    })
-}
-
-/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
-pub(crate) fn start_measuring_requests(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
-        crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-            kind,
-            AttemptOutcome::Cancelled,
-            started_at,
-        )
-    })
-}
-
-pub(crate) struct BucketMetrics {
+pub(super) struct BucketMetrics {
    /// Full request duration until successful completion, error or cancellation.
-    pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
    /// Total amount of seconds waited on queue.
-    pub(crate) wait_seconds: RequestTyped<Histogram>,
+    pub(super) wait_seconds: RequestTyped<Histogram>,

    /// Track how many semaphore awaits were cancelled per request type.
    ///
    /// This is in case cancellations are happening more than expected.
-    pub(crate) cancelled_waits: RequestTyped<IntCounter>,
+    pub(super) cancelled_waits: RequestTyped<IntCounter>,

    /// Total amount of deleted objects in batches or single requests.
-    pub(crate) deleted_objects_total: IntCounter,
+    pub(super) deleted_objects_total: IntCounter,
 }

 impl Default for BucketMetrics {
--- a/libs/remote_storage/src/support.rs
+++ b/libs/remote_storage/src/support.rs
@@ -78,10 +78,6 @@ where
                let e = Err(std::io::Error::from(e));
                return Poll::Ready(Some(e));
            }
-        } else {
-            // this would be perfectly valid behaviour for doing a graceful completion on the
-            // download for example, but not one we expect to do right now.
-            tracing::warn!("continuing polling after having cancelled or timeouted");
        }

        this.inner.poll_next(cx)
@@ -93,22 +89,13 @@ where
 }

 /// Fires only on the first cancel or timeout, not on both.
-pub(crate) fn cancel_or_timeout(
+pub(crate) async fn cancel_or_timeout(
    timeout: Duration,
    cancel: CancellationToken,
-) -> impl std::future::Future<Output = TimeoutOrCancel> + 'static {
-    // futures are lazy, they don't do anything before being polled.
-    //
-    // "precalculate" the wanted deadline before returning the future, so that we can use pause
-    // failpoint to trigger a timeout in test.
-    let deadline = tokio::time::Instant::now() + timeout;
-    async move {
-        tokio::select! {
-            _ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout,
-            _ = cancel.cancelled() => {
-                TimeoutOrCancel::Cancel
-            },
-        }
+) -> TimeoutOrCancel {
+    tokio::select! {
+        _ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
+        _ = cancel.cancelled() => TimeoutOrCancel::Cancel,
    }
 }

@@ -185,31 +172,4 @@ mod tests {
            _ = tokio::time::sleep(Duration::from_secs(121)) => {},
        }
    }
-
-    #[tokio::test]
-    async fn notified_but_pollable_after() {
-        let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static(
-            b"hello world",
-        ))));
-        let timeout = Duration::from_secs(120);
-        let cancel = CancellationToken::new();
-
-        cancel.cancel();
-        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
-        let mut stream = std::pin::pin!(stream);
-
-        let next = stream.next().await;
-        let ioe = next.unwrap().unwrap_err();
-        assert!(
-            matches!(
-                ioe.get_ref().unwrap().downcast_ref::<DownloadError>(),
-                Some(&DownloadError::Cancelled)
-            ),
-            "{ioe:?}"
-        );
-
-        let next = stream.next().await;
-        let bytes = next.unwrap().unwrap();
-        assert_eq!(&b"hello world"[..], bytes);
-    }
 }
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -3,9 +3,6 @@ use std::{fs, io, path::Path};

 use anyhow::Context;

-mod rename_noreplace;
-pub use rename_noreplace::rename_noreplace;
-
 pub trait PathExt {
    /// Returns an error if `self` is not a directory.
    fn is_empty_dir(&self) -> io::Result<bool>;
--- a/libs/utils/src/fs_ext/rename_noreplace.rs
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -1,109 +0,0 @@
-use nix::NixPath;
-
-/// Rename a file without replacing an existing file.
-///
-/// This is a wrapper around platform-specific APIs.
-pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
-    src: &P1,
-    dst: &P2,
-) -> nix::Result<()> {
-    {
-        #[cfg(target_os = "linux")]
-        {
-            nix::fcntl::renameat2(
-                None,
-                src,
-                None,
-                dst,
-                nix::fcntl::RenameFlags::RENAME_NOREPLACE,
-            )
-        }
-        #[cfg(target_os = "macos")]
-        {
-            let res = src.with_nix_path(|src| {
-                dst.with_nix_path(|dst|
-                    // SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np.
-                    unsafe {
-                        nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL)
-                })
-            })??;
-            nix::errno::Errno::result(res).map(drop)
-        }
-        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
-        {
-            std::compile_error!("OS does not support no-replace renames");
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use std::{fs, path::PathBuf};
-
-    use super::*;
-
-    fn testdir() -> camino_tempfile::Utf8TempDir {
-        match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") {
-            Some(path) => {
-                let path: camino::Utf8PathBuf = path;
-                camino_tempfile::tempdir_in(path).unwrap()
-            }
-            None => camino_tempfile::tempdir().unwrap(),
-        }
-    }
-
-    #[test]
-    fn test_absolute_paths() {
-        let testdir = testdir();
-        println!("testdir: {}", testdir.path());
-
-        let src = testdir.path().join("src");
-        let dst = testdir.path().join("dst");
-
-        fs::write(&src, b"").unwrap();
-        fs::write(&dst, b"").unwrap();
-
-        let src = src.canonicalize().unwrap();
-        assert!(src.is_absolute());
-        let dst = dst.canonicalize().unwrap();
-        assert!(dst.is_absolute());
-
-        let result = rename_noreplace(&src, &dst);
-        assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
-    }
-
-    #[test]
-    fn test_relative_paths() {
-        let testdir = testdir();
-        println!("testdir: {}", testdir.path());
-
-        // this is fine because we run in nextest => process per test
-        std::env::set_current_dir(testdir.path()).unwrap();
-
-        let src = PathBuf::from("src");
-        let dst = PathBuf::from("dst");
-
-        fs::write(&src, b"").unwrap();
-        fs::write(&dst, b"").unwrap();
-
-        let result = rename_noreplace(&src, &dst);
-        assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
-    }
-
-    #[test]
-    fn test_works_when_not_exists() {
-        let testdir = testdir();
-        println!("testdir: {}", testdir.path());
-
-        let src = testdir.path().join("src");
-        let dst = testdir.path().join("dst");
-
-        fs::write(&src, b"content").unwrap();
-
-        rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap();
-        assert_eq!(
-            "content",
-            String::from_utf8(std::fs::read(&dst).unwrap()).unwrap()
-        );
-    }
-}
--- a/libs/utils/src/hex.rs
+++ b/libs/utils/src/hex.rs
@@ -19,13 +19,13 @@
 /// // right: [0x68; 1]
 /// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
 /// ```
-pub struct Hex<S>(pub S);
+#[derive(PartialEq)]
+pub struct Hex<'a>(pub &'a [u8]);

-impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
+impl std::fmt::Debug for Hex<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "[")?;
-        let chunks = self.0.as_ref().chunks(16);
-        for (i, c) in chunks.enumerate() {
+        for (i, c) in self.0.chunks(16).enumerate() {
            if i > 0 && !c.is_empty() {
                writeln!(f, ", ")?;
            }
@@ -36,15 +36,6 @@ impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
                write!(f, "0x{b:02x}")?;
            }
        }
-        write!(f, "; {}]", self.0.as_ref().len())
-    }
-}
-
-impl<R: AsRef<[u8]>, L: AsRef<[u8]>> PartialEq<Hex<R>> for Hex<L> {
-    fn eq(&self, other: &Hex<R>) -> bool {
-        let left = self.0.as_ref();
-        let right = other.0.as_ref();
-
-        left == right
+        write!(f, "; {}]", self.0.len())
    }
 }
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -17,7 +17,6 @@ pageserver = { path = ".." }
 pageserver_api.workspace = true
 remote_storage = { path = "../../libs/remote_storage" }
 postgres_ffi.workspace = true
-thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
 toml_edit.workspace = true
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -26,7 +26,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {

            let output = Output {
                layer_metadata: &des.layer_metadata,
-                disk_consistent_lsn: des.metadata.disk_consistent_lsn(),
+                disk_consistent_lsn: des.get_disk_consistent_lsn(),
                timeline_metadata: &des.metadata,
            };

--- a/pageserver/ctl/src/key.rs
+++ b/pageserver/ctl/src/key.rs
@@ -1,475 +0,0 @@
-use anyhow::Context;
-use clap::Parser;
-use pageserver_api::{
-    key::Key,
-    reltag::{BlockNumber, RelTag, SlruKind},
-    shard::{ShardCount, ShardStripeSize},
-};
-use std::str::FromStr;
-
-#[derive(Parser)]
-pub(super) struct DescribeKeyCommand {
-    /// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum
-    input: Vec<String>,
-
-    /// The number of shards to calculate what Keys placement would be.
-    #[arg(long)]
-    shard_count: Option<CustomShardCount>,
-
-    /// The sharding stripe size.
-    ///
-    /// The default is hardcoded. It makes no sense to provide this without providing
-    /// `--shard-count`.
-    #[arg(long, requires = "shard_count")]
-    stripe_size: Option<u32>,
-}
-
-/// Sharded shard count without unsharded count, which the actual ShardCount supports.
-#[derive(Clone, Copy)]
-pub(super) struct CustomShardCount(std::num::NonZeroU8);
-
-#[derive(Debug, thiserror::Error)]
-pub(super) enum InvalidShardCount {
-    #[error(transparent)]
-    ParsingFailed(#[from] std::num::ParseIntError),
-    #[error("too few shards")]
-    TooFewShards,
-}
-
-impl FromStr for CustomShardCount {
-    type Err = InvalidShardCount;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let inner: std::num::NonZeroU8 = s.parse()?;
-        if inner.get() < 2 {
-            Err(InvalidShardCount::TooFewShards)
-        } else {
-            Ok(CustomShardCount(inner))
-        }
-    }
-}
-
-impl From<CustomShardCount> for ShardCount {
-    fn from(value: CustomShardCount) -> Self {
-        ShardCount::new(value.0.get())
-    }
-}
-
-impl DescribeKeyCommand {
-    pub(super) fn execute(self) {
-        let DescribeKeyCommand {
-            input,
-            shard_count,
-            stripe_size,
-        } = self;
-
-        let material = KeyMaterial::try_from(input.as_slice()).unwrap();
-        let kind = material.kind();
-        let key = Key::from(material);
-
-        println!("parsed from {kind}: {key}:");
-        println!();
-        println!("{key:?}");
-
-        macro_rules! kind_query {
-            ([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}};
-            ($name:ident) => {{
-                let s: &'static str = stringify!($name);
-                let s = s.strip_prefix("is_").unwrap_or(s);
-                let s = s.strip_suffix("_key").unwrap_or(s);
-
-                #[allow(clippy::needless_borrow)]
-                (s, key.$name())
-            }};
-        }
-
-        // the current characterization is a mess of these boolean queries and separate
-        // "recognization". I think it accurately represents how strictly we model the Key
-        // right now, but could of course be made less confusing.
-
-        let queries = kind_query!([
-            is_rel_block_key,
-            is_rel_vm_block_key,
-            is_rel_fsm_block_key,
-            is_slru_block_key,
-            is_inherited_key,
-            is_rel_size_key,
-            is_slru_segment_size_key,
-        ]);
-
-        let recognized_kind = "recognized kind";
-        let metadata_key = "metadata key";
-        let shard_placement = "shard placement";
-
-        let longest = queries
-            .iter()
-            .map(|t| t.0)
-            .chain([recognized_kind, metadata_key, shard_placement])
-            .map(|s| s.len())
-            .max()
-            .unwrap();
-
-        let colon = 1;
-        let padding = 1;
-
-        for (name, is) in queries {
-            let width = longest - name.len() + colon + padding;
-            println!("{}{:width$}{}", name, ":", is);
-        }
-
-        let width = longest - recognized_kind.len() + colon + padding;
-        println!(
-            "{}{:width$}{:?}",
-            recognized_kind,
-            ":",
-            RecognizedKeyKind::new(key),
-        );
-
-        if let Some(shard_count) = shard_count {
-            // seeing the sharding placement might be confusing, so leave it out unless shard
-            // count was given.
-
-            let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default();
-            println!(
-                "# placement with shard_count: {} and stripe_size: {}:",
-                shard_count.0, stripe_size.0
-            );
-            let width = longest - shard_placement.len() + colon + padding;
-            println!(
-                "{}{:width$}{:?}",
-                shard_placement,
-                ":",
-                pageserver_api::shard::describe(&key, shard_count.into(), stripe_size)
-            );
-        }
-    }
-}
-
-/// Hand-wavy "inputs we accept" for a key.
-#[derive(Debug)]
-pub(super) enum KeyMaterial {
-    Hex(Key),
-    String(SpanAttributesFromLogs),
-    Split(RelTag, BlockNumber),
-}
-
-impl KeyMaterial {
-    fn kind(&self) -> &'static str {
-        match self {
-            KeyMaterial::Hex(_) => "hex",
-            KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split",
-        }
-    }
-}
-
-impl From<KeyMaterial> for Key {
-    fn from(value: KeyMaterial) -> Self {
-        match value {
-            KeyMaterial::Hex(key) => key,
-            KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum))
-            | KeyMaterial::Split(rt, blocknum) => {
-                pageserver_api::key::rel_block_to_key(rt, blocknum)
-            }
-        }
-    }
-}
-
-impl<S: AsRef<str>> TryFrom<&[S]> for KeyMaterial {
-    type Error = anyhow::Error;
-
-    fn try_from(value: &[S]) -> Result<Self, Self::Error> {
-        match value {
-            [] => anyhow::bail!(
-                "need 1..N positional arguments describing the key, try hex or a log line"
-            ),
-            [one] => {
-                let one = one.as_ref();
-
-                let key = Key::from_hex(one).map(KeyMaterial::Hex);
-
-                let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String);
-
-                match (key, attrs) {
-                    (Ok(key), _) => Ok(key),
-                    (_, Ok(s)) => Ok(s),
-                    (Err(e1), Err(e2)) => anyhow::bail!(
-                        "failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}"
-                    ),
-                }
-            }
-            more => {
-                // assume going left to right one of these is a reltag and then we find a blocknum
-                // this works, because we don't have plain numbers at least right after reltag in
-                // logs. for some definition of "works".
-
-                let Some((reltag_at, reltag)) = more
-                    .iter()
-                    .map(AsRef::as_ref)
-                    .enumerate()
-                    .find_map(|(i, s)| {
-                        s.split_once("rel=")
-                            .map(|(_garbage, actual)| actual)
-                            .unwrap_or(s)
-                            .parse::<RelTag>()
-                            .ok()
-                            .map(|rt| (i, rt))
-                    })
-                else {
-                    anyhow::bail!("found no RelTag in arguments");
-                };
-
-                let Some(blocknum) = more
-                    .iter()
-                    .map(AsRef::as_ref)
-                    .skip(reltag_at)
-                    .find_map(|s| {
-                        s.split_once("blkno=")
-                            .map(|(_garbage, actual)| actual)
-                            .unwrap_or(s)
-                            .parse::<BlockNumber>()
-                            .ok()
-                    })
-                else {
-                    anyhow::bail!("found no blocknum in arguments");
-                };
-
-                Ok(KeyMaterial::Split(reltag, blocknum))
-            }
-        }
-    }
-}
-
-#[derive(Debug)]
-pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber);
-
-impl std::str::FromStr for SpanAttributesFromLogs {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // accept the span separator but do not require or fail if either is missing
-        // "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}"
-        let (_, reltag) = s
-            .split_once("rel=")
-            .ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?;
-        let reltag = reltag.split_whitespace().next().unwrap();
-
-        let (_, blocknum) = s
-            .split_once("blkno=")
-            .ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?;
-        let blocknum = blocknum.split_whitespace().next().unwrap();
-
-        let reltag = reltag
-            .parse()
-            .with_context(|| format!("parse reltag from {reltag:?}"))?;
-        let blocknum = blocknum
-            .parse()
-            .with_context(|| format!("parse blocknum from {blocknum:?}"))?;
-
-        Ok(Self(reltag, blocknum))
-    }
-}
-
-#[derive(Debug)]
-#[allow(dead_code)] // debug print is used
-enum RecognizedKeyKind {
-    DbDir,
-    ControlFile,
-    Checkpoint,
-    AuxFilesV1,
-    SlruDir(Result<SlruKind, u32>),
-    RelMap(RelTagish<2>),
-    RelDir(RelTagish<2>),
-    AuxFileV2(Result<AuxFileV2, utils::Hex<[u8; 16]>>),
-}
-
-#[derive(Debug, PartialEq)]
-#[allow(unused)]
-enum AuxFileV2 {
-    Recognized(&'static str, utils::Hex<[u8; 13]>),
-    OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>),
-    Other(utils::Hex<[u8; 13]>),
-}
-
-impl RecognizedKeyKind {
-    fn new(key: Key) -> Option<Self> {
-        use RecognizedKeyKind::{
-            AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir,
-        };
-
-        let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key);
-
-        Some(match key {
-            pageserver_api::key::DBDIR_KEY => DbDir,
-            pageserver_api::key::CONTROLFILE_KEY => ControlFile,
-            pageserver_api::key::CHECKPOINT_KEY => Checkpoint,
-            pageserver_api::key::AUX_FILES_KEY => AuxFilesV1,
-            _ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()),
-            _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => {
-                RelMap([key.field2, key.field3].into())
-            }
-            _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => {
-                RelDir([key.field2, key.field3].into())
-            }
-            _ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2(
-                AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())),
-            ),
-            _ => return None,
-        })
-    }
-}
-
-impl AuxFileV2 {
-    fn new(key: Key) -> Option<AuxFileV2> {
-        const EMPTY_HASH: [u8; 13] = {
-            let mut out = [0u8; 13];
-            let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes();
-            let mut i = 3;
-            while i < 16 {
-                out[i - 3] = hash[i];
-                i += 1;
-            }
-            out
-        };
-
-        let bytes = key.to_i128().to_be_bytes();
-        let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap());
-
-        assert_eq!(EMPTY_HASH.len(), hash.0.len());
-
-        // TODO: we could probably find the preimages for the hashes
-
-        Some(match (bytes[1], bytes[2]) {
-            (1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash),
-            (1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash),
-            (1, 3) if hash.0 == EMPTY_HASH => {
-                AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
-            }
-            (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
-            (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
-            (0xff, 0xff) => AuxFileV2::Other(hash),
-            _ => return None,
-        })
-    }
-}
-
-/// Prefix of RelTag, currently only known use cases are the two item versions.
-///
-/// Renders like a reltag with `/`, nothing else.
-struct RelTagish<const N: usize>([u32; N]);
-
-impl<const N: usize> From<[u32; N]> for RelTagish<N> {
-    fn from(val: [u32; N]) -> Self {
-        RelTagish(val)
-    }
-}
-
-impl<const N: usize> std::fmt::Debug for RelTagish<N> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        use std::fmt::Write as _;
-        let mut first = true;
-        self.0.iter().try_for_each(|x| {
-            if !first {
-                f.write_char('/')?;
-            }
-            first = false;
-            write!(f, "{}", x)
-        })
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use pageserver::aux_file::encode_aux_file_key;
-
-    use super::*;
-
-    #[test]
-    fn hex_is_key_material() {
-        let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap();
-        assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}");
-    }
-
-    #[test]
-    fn single_positional_spanalike_is_key_material() {
-        // why is this needed? if you are checking many, then copypaste starts to appeal
-        let strings = [
-            (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"),
-            (line!(), "rel=1663/208101/2620_fsm blkno=2"),
-            (line!(), "rel=1663/208101/2620.1 blkno=2"),
-        ];
-
-        let mut first: Option<Key> = None;
-
-        for (line, example) in strings {
-            let m = KeyMaterial::try_from(&[example][..])
-                .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
-            let key = Key::from(m);
-            if let Some(first) = first {
-                assert_eq!(first, key);
-            } else {
-                first = Some(key);
-            }
-        }
-
-        // not supporting this is rather accidential, but I think the input parsing is lenient
-        // enough already
-        KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err();
-    }
-
-    #[test]
-    fn multiple_spanlike_args() {
-        let strings = [
-            (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]),
-            (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]),
-            (line!(), &["1663/208101/2620_fsm", "2"][..]),
-        ];
-
-        let mut first: Option<Key> = None;
-
-        for (line, example) in strings {
-            let m = KeyMaterial::try_from(example)
-                .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
-            let key = Key::from(m);
-            if let Some(first) = first {
-                assert_eq!(first, key);
-            } else {
-                first = Some(key);
-            }
-        }
-    }
-    #[test]
-    fn recognized_auxfiles() {
-        use AuxFileV2::*;
-
-        let empty = [
-            0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d,
-        ];
-        let foobar = [
-            0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18,
-        ];
-
-        #[rustfmt::skip]
-        let examples = [
-            (line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))),
-            (line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))),
-            (line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))),
-            (line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))),
-            (line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))),
-            (line!(), "foobar", Other(utils::Hex(foobar))),
-        ];
-
-        for (line, path, expected) in examples {
-            let key = encode_aux_file_key(path);
-            let recognized =
-                AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed"));
-
-            assert_eq!(recognized, expected);
-        }
-
-        assert_eq!(
-            AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()),
-            None,
-            "example key has one too few 0 after 6 before 1"
-        );
-    }
-}
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -6,7 +6,6 @@

 mod draw_timeline_dir;
 mod index_part;
-mod key;
 mod layer_map_analyzer;
 mod layers;

@@ -62,8 +61,6 @@ enum Commands {
    AnalyzeLayerMap(AnalyzeLayerMapCmd),
    #[command(subcommand)]
    Layer(LayerCmd),
-    /// Debug print a hex key found from logs
-    Key(key::DescribeKeyCommand),
 }

 /// Read and update pageserver metadata file
@@ -186,7 +183,6 @@ async fn main() -> anyhow::Result<()> {
                .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
                .await?;
        }
-        Commands::Key(dkc) => dkc.execute(),
    };
    Ok(())
 }
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -5,7 +5,6 @@ use utils::lsn::Lsn;

 use std::collections::HashMap;
 use std::sync::Arc;
-use std::time::Instant;

 /// Ingest aux files into the pageserver.
 #[derive(clap::Parser)]
@@ -89,17 +88,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
        println!("ingested {file_cnt} files");
    }

-    for _ in 0..100 {
-        let start = Instant::now();
-        let files = mgmt_api_client
-            .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
-            .await?;
-        println!(
-            "{} files found in {}s",
-            files.len(),
-            start.elapsed().as_secs_f64()
-        );
-    }
+    let files = mgmt_api_client
+        .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
+        .await?;
+
+    println!("{} files found", files.len());

    anyhow::Ok(())
 }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,6 +1,6 @@
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver_api::key::Key;
+use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::PagestreamGetPageRequest;

@@ -187,7 +187,7 @@ async fn main_impl(
                    for r in partitioning.keys.ranges.iter() {
                        let mut i = r.start;
                        while i != r.end {
-                            if i.is_rel_block_key() {
+                            if is_rel_block_key(&i) {
                                filtered.add_key(i);
                            }
                            i = i.next();
@@ -308,10 +308,9 @@ async fn main_impl(
                    let r = &ranges[weights.sample(&mut rng)];
                    let key: i128 = rng.gen_range(r.start..r.end);
                    let key = Key::from_i128(key);
-                    assert!(key.is_rel_block_key());
-                    let (rel_tag, block_no) = key
-                        .to_rel_block()
-                        .expect("we filter non-rel-block keys out above");
+                    assert!(is_rel_block_key(&key));
+                    let (rel_tag, block_no) =
+                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
                    PagestreamGetPageRequest {
                        request_lsn: if rng.gen_bool(args.req_latest_probability) {
                            Lsn::MAX
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -178,8 +178,7 @@ impl AuxFileSizeEstimator {
        }
    }

-    /// When generating base backup or doing initial logical size calculation
-    pub fn on_initial(&self, new_size: usize) {
+    pub fn on_base_backup(&self, new_size: usize) {
        let mut guard = self.size.lock().unwrap();
        *guard = Some(new_size as isize);
        self.report(new_size as isize);
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::Key;
+use pageserver_api::key::{key_to_slru_block, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -170,7 +170,7 @@ where
    }

    async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
-        let (kind, segno, _) = key.to_slru_block()?;
+        let (kind, segno, _) = key_to_slru_block(*key)?;

        match kind {
            SlruKind::Clog => {
@@ -362,13 +362,6 @@ where
                    ));
                    info!("Replication slot {} restart LSN={}", path, restart_lsn);
                    min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
-                } else if path == "pg_logical/replorigin_checkpoint" {
-                    // replorigin_checkoint is written only on compute shutdown, so it contains
-                    // deteriorated values. So we generate our own version of this file for the particular LSN
-                    // based on information about replorigins extracted from transaction commit records.
-                    // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
-                    // but now we should handle (skip) it for backward compatibility.
-                    continue;
                }
                let header = new_tar_header(&path, content.len() as u64)?;
                self.ar
@@ -397,32 +390,6 @@ where
        {
            self.add_twophase_file(xid).await?;
        }
-        let repl_origins = self
-            .timeline
-            .get_replorigins(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
-        let n_origins = repl_origins.len();
-        if n_origins != 0 {
-            //
-            // Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins
-            // extracted from transaction commit record. We are using this file to pass information about replication
-            // origins to compute to allow logical replication to restart from proper point.
-            //
-            let mut content = Vec::with_capacity(n_origins * 16 + 8);
-            content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes());
-            for (origin_id, origin_lsn) in repl_origins {
-                content.extend_from_slice(&origin_id.to_le_bytes());
-                content.extend_from_slice(&[0u8; 6]); // align to 8 bytes
-                content.extend_from_slice(&origin_lsn.0.to_le_bytes());
-            }
-            let crc32 = crc32c::crc32c(&content);
-            content.extend_from_slice(&crc32.to_le_bytes());
-            let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
-            self.ar.append(&header, &*content).await.context(
-                "could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
-            )?;
-        }

        fail_point!("basebackup-before-control-file", |_| {
            Err(BasebackupError::Server(anyhow!(
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -99,6 +99,8 @@ pub mod defaults {

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

+    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
+
    ///
    /// Default built-in configuration file.
    ///
@@ -144,6 +146,8 @@ pub mod defaults {

 #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'

+#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -296,6 +300,8 @@ pub struct PageServerConf {
    ///
    /// Setting this to zero disables limits on total ephemeral layer size.
    pub ephemeral_bytes_per_memory_kb: usize,
+
+    pub walredo_process_kind: crate::walredo::ProcessKind,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -401,6 +407,8 @@ struct PageServerConfigBuilder {
    validate_vectored_get: BuilderValue<bool>,

    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
+
+    walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
 }

 impl PageServerConfigBuilder {
@@ -489,6 +497,8 @@ impl PageServerConfigBuilder {
            )),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
+
+            walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
        }
    }
 }
@@ -676,6 +686,10 @@ impl PageServerConfigBuilder {
        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
    }

+    pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
+        self.walredo_process_kind = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -733,6 +747,7 @@ impl PageServerConfigBuilder {
                max_vectored_read_bytes,
                validate_vectored_get,
                ephemeral_bytes_per_memory_kb,
+                walredo_process_kind,
            }
            CUSTOM LOGIC
            {
@@ -1029,6 +1044,9 @@ impl PageServerConf {
                "ephemeral_bytes_per_memory_kb" => {
                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
                }
+                "walredo_process_kind" => {
+                    builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1112,6 +1130,7 @@ impl PageServerConf {
            ),
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
        }
    }
 }
@@ -1351,6 +1370,7 @@ background_task_maximum_delay = '334 s'
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1424,6 +1444,7 @@ background_task_maximum_delay = '334 s'
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
    // mean the synthetic size worker should terminate.
    let shutting_down = matches!(
        e.downcast_ref::<PageReconstructError>(),
-        Some(PageReconstructError::Cancelled)
+        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
    );

    if !shutting_down {
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -311,7 +311,7 @@ impl DeletionList {
                result.extend(
                    timeline_layers
                        .into_iter()
-                        .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
+                        .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
                );
            }
        }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -74,7 +74,6 @@ use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::CompactFlags;
-use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
 use crate::tenant::SpawnMode;
@@ -184,6 +183,9 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::Cancelled => {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
+            PageReconstructError::AncestorStopping(_) => {
+                ApiError::ResourceUnavailable(format!("{pre}").into())
+            }
            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
@@ -1811,22 +1813,11 @@ async fn timeline_checkpoint_handler(
        timeline
            .freeze_and_flush()
            .await
-            .map_err(|e| {
-                match e {
-                    tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
-                    other => ApiError::InternalServerError(other.into()),
-
-                }
-            })?;
+            .map_err(ApiError::InternalServerError)?;
        timeline
            .compact(&cancel, flags, &ctx)
            .await
-            .map_err(|e|
-                match e {
-                    CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                    CompactionError::Other(e) => ApiError::InternalServerError(e)
-                }
-            )?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        if wait_until_uploaded {
            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
@@ -2182,7 +2173,7 @@ async fn tenant_scan_remote_handler(
            {
                Ok((index_part, index_generation)) => {
                    tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
-                        index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn());
+                        index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn());
                    generation = std::cmp::max(generation, index_generation);
                }
                Err(DownloadError::NotFound) => {
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -66,7 +66,6 @@ use crate::tenant::mgr::GetTenantError;
 use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
 use crate::tenant::mgr::TenantManager;
-use crate::tenant::timeline::FlushLayerError;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
@@ -373,7 +372,7 @@ impl From<WaitLsnError> for PageStreamError {
        match value {
            e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
            WaitLsnError::Shutdown => Self::Shutdown,
-            e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()),
+            WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
        }
    }
 }
@@ -383,7 +382,7 @@ impl From<WaitLsnError> for QueryError {
        match value {
            e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
            WaitLsnError::Shutdown => Self::Shutdown,
-            WaitLsnError::BadState { .. } => Self::Reconnect,
+            WaitLsnError::BadState => Self::Reconnect,
        }
    }
 }
@@ -831,10 +830,7 @@ impl PageServerHandler {
        // We only want to persist the data, and it doesn't matter if it's in the
        // shape of deltas or images.
        info!("flushing layers");
-        timeline.freeze_and_flush().await.map_err(|e| match e {
-            FlushLayerError::Cancelled => QueryError::Shutdown,
-            other => QueryError::Other(other.into()),
-        })?;
+        timeline.freeze_and_flush().await?;

        info!("done");
        Ok(())
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -17,8 +17,8 @@ use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use itertools::Itertools;
 use pageserver_api::key::{
-    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
-    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
+    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
+    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
@@ -27,7 +27,7 @@ use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
-use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
+use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
@@ -78,19 +78,11 @@ pub enum LsnForTimestamp {
 }

 #[derive(Debug, thiserror::Error)]
-pub(crate) enum CalculateLogicalSizeError {
+pub enum CalculateLogicalSizeError {
    #[error("cancelled")]
    Cancelled,
-
-    /// Something went wrong while reading the metadata we use to calculate logical size
-    /// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`]
-    /// in the `From` implementation for this variant.
    #[error(transparent)]
-    PageRead(PageReconstructError),
-
-    /// Something went wrong deserializing metadata that we read to calculate logical size
-    #[error("decode error: {0}")]
-    Decode(#[from] DeserializeError),
+    Other(#[from] anyhow::Error),
 }

 #[derive(Debug, thiserror::Error)]
@@ -115,8 +107,10 @@ impl From<PageReconstructError> for CollectKeySpaceError {
 impl From<PageReconstructError> for CalculateLogicalSizeError {
    fn from(pre: PageReconstructError) -> Self {
        match pre {
-            PageReconstructError::Cancelled => Self::Cancelled,
-            _ => Self::PageRead(pre),
+            PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
+                Self::Cancelled
+            }
+            _ => Self::Other(pre.into()),
        }
    }
 }
@@ -718,22 +712,10 @@ impl Timeline {
                result.insert(fname, content);
            }
        }
-        self.aux_file_size_estimator.on_initial(sz);
+        self.aux_file_size_estimator.on_base_backup(sz);
        Ok(result)
    }

-    pub(crate) async fn trigger_aux_file_size_computation(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
-        let current_policy = self.last_aux_file_policy.load();
-        if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy {
-            self.list_aux_files_v2(lsn, ctx).await?;
-        }
-        Ok(())
-    }
-
    pub(crate) async fn list_aux_files(
        &self,
        lsn: Lsn,
@@ -772,27 +754,6 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_replorigins(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
-        let kv = self
-            .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
-            .await
-            .context("scan")?;
-        let mut result = HashMap::new();
-        for (k, v) in kv {
-            let v = v.context("get value")?;
-            let origin_id = k.field6 as RepOriginId;
-            let origin_lsn = Lsn::des(&v).unwrap();
-            if origin_lsn != Lsn::INVALID {
-                result.insert(origin_id, origin_lsn);
-            }
-        }
-        Ok(result)
-    }
-
    /// Does the same as get_current_logical_size but counted on demand.
    /// Used to initialize the logical size tracking on startup.
    ///
@@ -802,7 +763,7 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub(crate) async fn get_current_logical_size_non_incremental(
+    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -811,7 +772,7 @@ impl Timeline {

        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf)?;
+        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
@@ -918,9 +879,7 @@ impl Timeline {
        Ok((
            result.to_keyspace(),
            /* AUX sparse key space */
-            SparseKeySpace(KeySpace {
-                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
-            }),
+            SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
        ))
    }

@@ -1189,20 +1148,6 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub async fn set_replorigin(
-        &mut self,
-        origin_id: RepOriginId,
-        origin_lsn: Lsn,
-    ) -> anyhow::Result<()> {
-        let key = repl_origin_key(origin_id);
-        self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
-        Ok(())
-    }
-
-    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
-        self.set_replorigin(origin_id, Lsn::INVALID).await
-    }
-
    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
        self.put(CONTROLFILE_KEY, Value::Image(img));
        Ok(())
@@ -1607,7 +1552,7 @@ impl<'a> DatadirModification<'a> {
                    self.tline.aux_file_size_estimator.on_add(content.len());
                    new_files.push((path, content));
                }
-                (None, true) => warn!("removing non-existing aux file: {}", path),
+                (None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
            }
            let new_val = aux_file::encode_file_value(&new_files)?;
            self.put(key, Value::Image(new_val.into()));
@@ -1661,7 +1606,8 @@ impl<'a> DatadirModification<'a> {
                        aux_files.dir = Some(dir);
                    }
                    Err(
-                        e @ (PageReconstructError::Cancelled
+                        e @ (PageReconstructError::AncestorStopping(_)
+                        | PageReconstructError::Cancelled
                        | PageReconstructError::AncestorLsnTimeout(_)),
                    ) => {
                        // Important that we do not interpret a shutdown error as "not found" and thereby
@@ -1733,7 +1679,7 @@ impl<'a> DatadirModification<'a> {
        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
        for (key, values) in self.pending_updates.drain() {
            for (lsn, value) in values {
-                if key.is_rel_block_key() || key.is_slru_block_key() {
+                if is_rel_block_key(&key) || is_slru_block_key(key) {
                    // This bails out on first error without modifying pending_updates.
                    // That's Ok, cf this function's doc comment.
                    writer.put(key, lsn, &value, ctx).await?;
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -487,33 +487,6 @@ enum CreateTimelineCause {
    Delete,
 }

-#[derive(thiserror::Error, Debug)]
-pub(crate) enum GcError {
-    // The tenant is shutting down
-    #[error("tenant shutting down")]
-    TenantCancelled,
-
-    // The tenant is shutting down
-    #[error("timeline shutting down")]
-    TimelineCancelled,
-
-    // The tenant is in a state inelegible to run GC
-    #[error("not active")]
-    NotActive,
-
-    // A requested GC cutoff LSN was invalid, for example it tried to move backwards
-    #[error("not active")]
-    BadLsn { why: String },
-
-    // A remote storage error while scheduling updates after compaction
-    #[error(transparent)]
-    Remote(anyhow::Error),
-
-    // If GC was invoked for a particular timeline, this error means it didn't exist
-    #[error("timeline not found")]
-    TimelineNotFound,
-}
-
 impl Tenant {
    /// Yet another helper for timeline initialization.
    ///
@@ -1420,36 +1393,6 @@ impl Tenant {
        Ok(tl)
    }

-    /// Helper for unit tests to create a timeline with some pre-loaded states.
-    #[cfg(test)]
-    #[allow(clippy::too_many_arguments)]
-    pub async fn create_test_timeline_with_layers(
-        &self,
-        new_timeline_id: TimelineId,
-        initdb_lsn: Lsn,
-        pg_version: u32,
-        ctx: &RequestContext,
-        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
-        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
-        end_lsn: Lsn,
-    ) -> anyhow::Result<Arc<Timeline>> {
-        let tline = self
-            .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
-            .await?;
-        tline.force_advance_lsn(end_lsn);
-        for deltas in delta_layer_desc {
-            tline
-                .force_create_delta_layer(deltas, Some(initdb_lsn), ctx)
-                .await?;
-        }
-        for (lsn, images) in image_layer_desc {
-            tline
-                .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
-                .await?;
-        }
-        Ok(tline)
-    }
-
    /// Create a new timeline.
    ///
    /// Returns the new timeline ID and reference to its Timeline object.
@@ -1564,7 +1507,7 @@ impl Tenant {
                        .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
                        .await
                        .map_err(|e| match e {
-                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
+                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
                                CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
                            }
                            WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
@@ -1632,23 +1575,24 @@ impl Tenant {
    /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever
    /// requires more history to be retained.
    //
-    pub(crate) async fn gc_iteration(
+    pub async fn gc_iteration(
        &self,
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<GcResult, GcError> {
+    ) -> anyhow::Result<GcResult> {
        // Don't start doing work during shutdown
        if let TenantState::Stopping { .. } = self.current_state() {
            return Ok(GcResult::default());
        }

        // there is a global allowed_error for this
-        if !self.is_active() {
-            return Err(GcError::NotActive);
-        }
+        anyhow::ensure!(
+            self.is_active(),
+            "Cannot run GC iteration on inactive tenant"
+        );

        {
            let conf = self.tenant_conf.load();
@@ -2816,13 +2760,28 @@ impl Tenant {
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<GcResult, GcError> {
+    ) -> anyhow::Result<GcResult> {
        let mut totals: GcResult = Default::default();
        let now = Instant::now();

-        let gc_timelines = self
+        let gc_timelines = match self
            .refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
-            .await?;
+            .await
+        {
+            Ok(result) => result,
+            Err(e) => {
+                if let Some(PageReconstructError::Cancelled) =
+                    e.downcast_ref::<PageReconstructError>()
+                {
+                    // Handle cancellation
+                    totals.elapsed = now.elapsed();
+                    return Ok(totals);
+                } else {
+                    // Propagate other errors
+                    return Err(e);
+                }
+            }
+        };

        failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");

@@ -2847,19 +2806,7 @@ impl Tenant {
                // made.
                break;
            }
-            let result = match timeline.gc().await {
-                Err(GcError::TimelineCancelled) => {
-                    if target_timeline_id.is_some() {
-                        // If we were targetting this specific timeline, surface cancellation to caller
-                        return Err(GcError::TimelineCancelled);
-                    } else {
-                        // A timeline may be shutting down independently of the tenant's lifecycle: we should
-                        // skip past this and proceed to try GC on other timelines.
-                        continue;
-                    }
-                }
-                r => r?,
-            };
+            let result = timeline.gc().await?;
            totals += result;
        }

@@ -2872,11 +2819,11 @@ impl Tenant {
    /// [`Tenant::get_gc_horizon`].
    ///
    /// This is usually executed as part of periodic gc, but can now be triggered more often.
-    pub(crate) async fn refresh_gc_info(
+    pub async fn refresh_gc_info(
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<Vec<Arc<Timeline>>, GcError> {
+    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
        // since this method can now be called at different rates than the configured gc loop, it
        // might be that these configuration values get applied faster than what it was previously,
        // since these were only read from the gc task.
@@ -2897,7 +2844,7 @@ impl Tenant {
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<Vec<Arc<Timeline>>, GcError> {
+    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
        // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
        // currently visible timelines.
        let timelines = self
@@ -2934,8 +2881,8 @@ impl Tenant {
            }
        }

-        if !self.is_active() || self.cancel.is_cancelled() {
-            return Err(GcError::TenantCancelled);
+        if !self.is_active() {
+            anyhow::bail!("shutting down");
        }

        // grab mutex to prevent new timelines from being created here; avoid doing long operations
@@ -2944,19 +2891,19 @@ impl Tenant {

        // Scan all timelines. For each timeline, remember the timeline ID and
        // the branch point where it was created.
-        let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
+        let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
            let timelines = self.timelines.lock().unwrap();
            let mut all_branchpoints = BTreeSet::new();
-            let timelines = {
+            let timeline_ids = {
                if let Some(target_timeline_id) = target_timeline_id.as_ref() {
                    if timelines.get(target_timeline_id).is_none() {
-                        return Err(GcError::TimelineNotFound);
+                        bail!("gc target timeline does not exist")
                    }
                };

                timelines
                    .iter()
-                    .map(|(_timeline_id, timeline_entry)| {
+                    .map(|(timeline_id, timeline_entry)| {
                        if let Some(ancestor_timeline_id) =
                            &timeline_entry.get_ancestor_timeline_id()
                        {
@@ -2978,28 +2925,33 @@ impl Tenant {
                            }
                        }

-                        timeline_entry.clone()
+                        *timeline_id
                    })
                    .collect::<Vec<_>>()
            };
-            (all_branchpoints, timelines)
+            (all_branchpoints, timeline_ids)
        };

        // Ok, we now know all the branch points.
        // Update the GC information for each timeline.
-        let mut gc_timelines = Vec::with_capacity(timelines.len());
-        for timeline in timelines {
+        let mut gc_timelines = Vec::with_capacity(timeline_ids.len());
+        for timeline_id in timeline_ids {
+            // Timeline is known to be local and loaded.
+            let timeline = self
+                .get_timeline(timeline_id, false)
+                .with_context(|| format!("Timeline {timeline_id} was not found"))?;
+
            // If target_timeline is specified, ignore all other timelines
            if let Some(target_timeline_id) = target_timeline_id {
-                if timeline.timeline_id != target_timeline_id {
+                if timeline_id != target_timeline_id {
                    continue;
                }
            }

            let branchpoints: Vec<Lsn> = all_branchpoints
                .range((
-                    Included((timeline.timeline_id, Lsn(0))),
-                    Included((timeline.timeline_id, Lsn(u64::MAX))),
+                    Included((timeline_id, Lsn(0))),
+                    Included((timeline_id, Lsn(u64::MAX))),
                ))
                .map(|&x| x.1)
                .collect();
@@ -3007,7 +2959,7 @@ impl Tenant {
            {
                let mut target = timeline.gc_info.write().unwrap();

-                match gc_cutoffs.remove(&timeline.timeline_id) {
+                match gc_cutoffs.remove(&timeline_id) {
                    Some(cutoffs) => {
                        *target = GcInfo {
                            retain_lsns: branchpoints,
@@ -3040,53 +2992,17 @@ impl Tenant {
        &self,
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
-        ancestor_lsn: Option<Lsn>,
+        start_lsn: Option<Lsn>,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
        let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
        let tl = self
-            .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx)
+            .branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
            .await?;
        tl.set_state(TimelineState::Active);
        Ok(tl)
    }

-    /// Helper for unit tests to branch a timeline with some pre-loaded states.
-    #[cfg(test)]
-    #[allow(clippy::too_many_arguments)]
-    pub async fn branch_timeline_test_with_layers(
-        &self,
-        src_timeline: &Arc<Timeline>,
-        dst_id: TimelineId,
-        ancestor_lsn: Option<Lsn>,
-        ctx: &RequestContext,
-        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
-        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
-        end_lsn: Lsn,
-    ) -> anyhow::Result<Arc<Timeline>> {
-        let tline = self
-            .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
-            .await?;
-        let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn {
-            ancestor_lsn
-        } else {
-            tline.get_last_record_lsn()
-        };
-        assert!(end_lsn >= ancestor_lsn);
-        tline.force_advance_lsn(end_lsn);
-        for deltas in delta_layer_desc {
-            tline
-                .force_create_delta_layer(deltas, Some(ancestor_lsn), ctx)
-                .await?;
-        }
-        for (lsn, images) in image_layer_desc {
-            tline
-                .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
-                .await?;
-        }
-        Ok(tline)
-    }
-
    /// Branch an existing timeline.
    ///
    /// The caller is responsible for activating the returned timeline.
@@ -3865,9 +3781,6 @@ pub(crate) mod harness {
        pub fn create_custom(
            test_name: &'static str,
            tenant_conf: TenantConf,
-            tenant_id: TenantId,
-            shard_identity: ShardIdentity,
-            generation: Generation,
        ) -> anyhow::Result<Self> {
            setup_logging();

@@ -3880,12 +3793,8 @@ pub(crate) mod harness {
            // OK in a test.
            let conf: &'static PageServerConf = Box::leak(Box::new(conf));

-            let shard = shard_identity.shard_index();
-            let tenant_shard_id = TenantShardId {
-                tenant_id,
-                shard_number: shard.shard_number,
-                shard_count: shard.shard_count,
-            };
+            let tenant_id = TenantId::generate();
+            let tenant_shard_id = TenantShardId::unsharded(tenant_id);
            fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?;
            fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?;

@@ -3903,8 +3812,8 @@ pub(crate) mod harness {
                conf,
                tenant_conf,
                tenant_shard_id,
-                generation,
-                shard,
+                generation: Generation::new(0xdeadbeef),
+                shard: ShardIndex::unsharded(),
                remote_storage,
                remote_fs_dir,
                deletion_queue,
@@ -3919,15 +3828,8 @@ pub(crate) mod harness {
                compaction_period: Duration::ZERO,
                ..TenantConf::default()
            };
-            let tenant_id = TenantId::generate();
-            let shard = ShardIdentity::unsharded();
-            Self::create_custom(
-                test_name,
-                tenant_conf,
-                tenant_id,
-                shard,
-                Generation::new(0xdeadbeef),
-            )
+
+            Self::create_custom(test_name, tenant_conf)
        }

        pub fn span(&self) -> tracing::Span {
@@ -4006,8 +3908,8 @@ pub(crate) mod harness {
                let base_img = base_img.expect("Neon WAL redo requires base image").1;
                let mut page = BytesMut::new();
                page.extend_from_slice(&base_img);
-                for (record_lsn, record) in records {
-                    apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
+                for (_record_lsn, record) in records {
+                    apply_neon::apply_in_neon(&record, key, &mut page)?;
                }
                Ok(page.freeze())
            } else {
@@ -4051,7 +3953,6 @@ mod tests {
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
    use utils::bin_ser::BeSer;
-    use utils::id::TenantId;

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4253,7 +4154,7 @@ mod tests {
                .await?;
            writer.finish_write(lsn);
        }
-        tline.freeze_and_flush().await.map_err(|e| e.into())
+        tline.freeze_and_flush().await
    }

    #[tokio::test]
@@ -4407,10 +4308,9 @@ mod tests {

        // This needs to traverse to the parent, and fails.
        let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
-        assert!(err.to_string().starts_with(&format!(
-            "Bad state on timeline {}: Broken",
-            tline.timeline_id
-        )));
+        assert!(err
+            .to_string()
+            .contains("will not become active. Current state: Broken"));

        Ok(())
    }
@@ -4951,13 +4851,7 @@ mod tests {
            ..TenantConf::default()
        };

-        let harness = TenantHarness::create_custom(
-            "test_get_vectored_key_gap",
-            tenant_conf,
-            TenantId::generate(),
-            ShardIdentity::unsharded(),
-            Generation::new(0xdeadbeef),
-        )?;
+        let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?;
        let (tenant, ctx) = harness.load().await;

        let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
@@ -6311,36 +6205,75 @@ mod tests {
    async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
        let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let cancel = CancellationToken::new();

        let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
        let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
        let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();

-        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                Vec::new(), // delta layers
-                vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
-                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
-            )
-            .await?;
+        let mut lsn = Lsn(0x20);
+
+        {
+            let mut writer = tline.writer().await;
+            writer
+                .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            tline.freeze_and_flush().await?; // this will create an image layer
+        }

        let child = tenant
-            .branch_timeline_test_with_layers(
-                &tline,
-                NEW_TIMELINE_ID,
-                Some(Lsn(0x20)),
-                &ctx,
-                Vec::new(), // delta layers
-                vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers
-                Lsn(0x30),
-            )
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
            .await
            .unwrap();

+        lsn.0 += 0x10;
+
+        {
+            let mut writer = child.writer().await;
+            writer
+                .put(
+                    base_key_child,
+                    lsn,
+                    &Value::Image(test_img("data key 2")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            child.freeze_and_flush().await?; // this will create a delta
+
+            {
+                // update the partitioning to include the test key space, otherwise they
+                // will be dropped by image layer creation
+                let mut guard = child.partitioning.lock().await;
+                let ((partitioning, _), partition_lsn) = &mut *guard;
+                partitioning
+                    .parts
+                    .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
+                *partition_lsn = lsn;
+            }
+
+            child
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for the keys, TODO: check if the image layer is created
+        }
+
        async fn get_vectored_impl_wrapper(
            tline: &Arc<Timeline>,
            key: Key,
@@ -6362,8 +6295,6 @@ mod tests {
            }))
        }

-        let lsn = Lsn(0x30);
-
        // test vectored get on parent timeline
        assert_eq!(
            get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
@@ -6401,42 +6332,94 @@ mod tests {

    #[tokio::test]
    async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
+        let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
        let (tenant, ctx) = harness.load().await;
-
-        let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
-        let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
-        let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
-        assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
-
        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                Vec::new(), // delta layers
-                vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
-                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
-            )
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;

+        let cancel = CancellationToken::new();
+
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
+        let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
+        base_key_child.field1 = AUX_KEY_PREFIX;
+        base_key_nonexist.field1 = AUX_KEY_PREFIX;
+
+        let mut lsn = Lsn(0x20);
+
+        {
+            let mut writer = tline.writer().await;
+            writer
+                .put(
+                    base_key,
+                    lsn,
+                    &Value::Image(test_img("metadata key 1")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            tline.freeze_and_flush().await?; // this will create an image layer
+
+            tline
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set.insert(CompactFlags::ForceRepartition);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for metadata keys
+            tenant
+                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+                .await?;
+        }
+
        let child = tenant
-            .branch_timeline_test_with_layers(
-                &tline,
-                NEW_TIMELINE_ID,
-                Some(Lsn(0x20)),
-                &ctx,
-                Vec::new(), // delta layers
-                vec![(
-                    Lsn(0x30),
-                    vec![(base_key_child, test_img("metadata key 2"))],
-                )], // image layers
-                Lsn(0x30),
-            )
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
            .await
            .unwrap();

+        lsn.0 += 0x10;
+
+        {
+            let mut writer = child.writer().await;
+            writer
+                .put(
+                    base_key_child,
+                    lsn,
+                    &Value::Image(test_img("metadata key 2")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            child.freeze_and_flush().await?;
+
+            child
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set.insert(CompactFlags::ForceRepartition);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for metadata keys
+            tenant
+                .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+                .await?;
+        }
+
        async fn get_vectored_impl_wrapper(
            tline: &Arc<Timeline>,
            key: Key,
@@ -6458,8 +6441,6 @@ mod tests {
            }))
        }

-        let lsn = Lsn(0x30);
-
        // test vectored get on parent timeline
        assert_eq!(
            get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
@@ -6490,208 +6471,4 @@ mod tests {

        Ok(())
    }
-
-    async fn get_vectored_impl_wrapper(
-        tline: &Arc<Timeline>,
-        key: Key,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<Option<Bytes>, GetVectoredError> {
-        let mut reconstruct_state = ValuesReconstructState::new();
-        let mut res = tline
-            .get_vectored_impl(
-                KeySpace::single(key..key.next()),
-                lsn,
-                &mut reconstruct_state,
-                ctx,
-            )
-            .await?;
-        Ok(res.pop_last().map(|(k, v)| {
-            assert_eq!(k, key);
-            v.unwrap()
-        }))
-    }
-
-    #[tokio::test]
-    async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_tombstone_reads")?;
-        let (tenant, ctx) = harness.load().await;
-        let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
-        let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
-        let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
-        let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
-
-        // We emulate the situation that the compaction algorithm creates an image layer that removes the tombstones
-        // Lsn 0x30 key0, key3, no key1+key2
-        // Lsn 0x20 key1+key2 tomestones
-        // Lsn 0x10 key1 in image, key2 in delta
-        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                // delta layers
-                vec![
-                    vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-                    vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-                    vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
-                ],
-                // image layers
-                vec![
-                    (Lsn(0x10), vec![(key1, test_img("metadata key 1"))]),
-                    (
-                        Lsn(0x30),
-                        vec![
-                            (key0, test_img("metadata key 0")),
-                            (key3, test_img("metadata key 3")),
-                        ],
-                    ),
-                ],
-                Lsn(0x30),
-            )
-            .await?;
-
-        let lsn = Lsn(0x30);
-        let old_lsn = Lsn(0x20);
-
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?,
-            Some(test_img("metadata key 0"))
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?,
-            None,
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?,
-            None,
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?,
-            Some(Bytes::new()),
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?,
-            Some(Bytes::new()),
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?,
-            Some(test_img("metadata key 3"))
-        );
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
-        let (tenant, ctx) = harness.load().await;
-
-        let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
-        let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
-        let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
-        let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
-
-        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                // delta layers
-                vec![
-                    vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-                    vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-                    vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
-                    vec![
-                        (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
-                        (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
-                    ],
-                ],
-                // image layers
-                vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
-                Lsn(0x30),
-            )
-            .await?;
-
-        let cancel = CancellationToken::new();
-
-        tline
-            .compact(
-                &cancel,
-                {
-                    let mut flags = EnumSet::new();
-                    flags.insert(CompactFlags::ForceImageLayerCreation);
-                    flags.insert(CompactFlags::ForceRepartition);
-                    flags
-                },
-                &ctx,
-            )
-            .await?;
-
-        // Image layers are created at last_record_lsn
-        let images = tline
-            .inspect_image_layers(Lsn(0x30), &ctx)
-            .await?
-            .into_iter()
-            .filter(|(k, _)| k.is_metadata_key())
-            .collect::<Vec<_>>();
-        assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed.
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
-        let (tenant, ctx) = harness.load().await;
-
-        let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
-        let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
-
-        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                // delta layers
-                vec![
-                    vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-                    vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-                    vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
-                ],
-                // image layers
-                vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
-                Lsn(0x30),
-            )
-            .await?;
-
-        let cancel = CancellationToken::new();
-
-        tline
-            .compact(
-                &cancel,
-                {
-                    let mut flags = EnumSet::new();
-                    flags.insert(CompactFlags::ForceImageLayerCreation);
-                    flags.insert(CompactFlags::ForceRepartition);
-                    flags
-                },
-                &ctx,
-            )
-            .await?;
-
-        // Image layers are created at last_record_lsn
-        let images = tline
-            .inspect_image_layers(Lsn(0x30), &ctx)
-            .await?
-            .into_iter()
-            .filter(|(k, _)| k.is_metadata_key())
-            .collect::<Vec<_>>();
-        assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
-
-        Ok(())
-    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -16,7 +16,6 @@ use crate::{
    task_mgr::{self, TaskKind},
    tenant::{
        mgr::{TenantSlot, TenantsMapRemoveResult},
-        remote_timeline_client::remote_heatmap_path,
        timeline::ShutdownMode,
    },
 };
@@ -532,25 +531,6 @@ impl DeleteTenantFlow {
            }
        }

-        // Remove top-level tenant objects that don't belong to a timeline, such as heatmap
-        let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
-        if let Some(Err(e)) = backoff::retry(
-            || async {
-                remote_storage
-                    .delete(&heatmap_path, &task_mgr::shutdown_token())
-                    .await
-            },
-            TimeoutOrCancel::caused_by_cancel,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "remove_remote_tenant_heatmap",
-            &task_mgr::shutdown_token(),
-        )
-        .await
-        {
-            tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
-        }
-
        let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
        if timelines_path.exists() {
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -267,7 +267,7 @@ impl<'de> Deserialize<'de> for TimelineMetadata {
        D: serde::Deserializer<'de>,
    {
        let bytes = Vec::<u8>::deserialize(deserializer)?;
-        Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom)
+        Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
    }
 }

@@ -276,163 +276,13 @@ impl Serialize for TimelineMetadata {
    where
        S: Serializer,
    {
-        let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?;
+        let bytes = self
+            .to_bytes()
+            .map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
        bytes.serialize(serializer)
    }
 }

-pub(crate) mod modern_serde {
-    use crate::tenant::metadata::METADATA_FORMAT_VERSION;
-
-    use super::{
-        TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE,
-    };
-    use serde::{Deserialize, Serialize};
-
-    pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
-    where
-        D: serde::de::Deserializer<'de>,
-    {
-        // for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec<u8> field with
-        // BeSer.
-        struct Visitor;
-
-        impl<'d> serde::de::Visitor<'d> for Visitor {
-            type Value = TimelineMetadata;
-
-            fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-                f.write_str("BeSer bytes or json structure")
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'d>,
-            {
-                use serde::de::Error;
-                let de = serde::de::value::SeqAccessDeserializer::new(seq);
-                Vec::<u8>::deserialize(de)
-                    .map(|v| TimelineMetadata::from_bytes(&v).map_err(A::Error::custom))?
-            }
-
-            fn visit_map<A>(self, map: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::MapAccess<'d>,
-            {
-                use serde::de::Error;
-
-                let de = serde::de::value::MapAccessDeserializer::new(map);
-                let body = TimelineMetadataBodyV2::deserialize(de)?;
-
-                // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
-                // across serialization versions
-                let mut sink = Crc32Sink::default();
-                <TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(&body, &mut sink)
-                    .map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?;
-
-                let size = METADATA_HDR_SIZE + sink.count;
-
-                Ok(TimelineMetadata {
-                    hdr: TimelineMetadataHeader {
-                        checksum: sink.crc,
-                        size: size as u16,
-                        format_version: METADATA_FORMAT_VERSION,
-                    },
-                    body,
-                })
-            }
-        }
-
-        deserializer.deserialize_any(Visitor)
-    }
-
-    #[derive(Default)]
-    struct Crc32Sink {
-        crc: u32,
-        count: usize,
-    }
-
-    impl std::io::Write for Crc32Sink {
-        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
-            self.crc = crc32c::crc32c_append(self.crc, buf);
-            self.count += buf.len();
-            Ok(buf.len())
-        }
-
-        fn flush(&mut self) -> std::io::Result<()> {
-            Ok(())
-        }
-    }
-
-    #[derive(thiserror::Error)]
-    #[error("re-serializing for crc32 failed")]
-    struct Crc32CalculationFailed<E>(#[source] E);
-
-    // this should be true for one release, after that we can change it to false
-    // remember to check the IndexPart::metadata field TODO comment as well
-    const LEGACY_BINCODED_BYTES: bool = true;
-
-    #[derive(serde::Serialize)]
-    #[serde(transparent)]
-    struct LegacyPaddedBytes<'a>(&'a TimelineMetadata);
-
-    struct JustTheBodyV2<'a>(&'a TimelineMetadata);
-
-    impl serde::Serialize for JustTheBodyV2<'_> {
-        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            // header is not needed, upon reading we've upgraded all v1 to v2
-            self.0.body.serialize(serializer)
-        }
-    }
-
-    pub(crate) fn serialize<S>(
-        metadata: &TimelineMetadata,
-        serializer: S,
-    ) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        // we cannot use TimelineMetadata::serialize for now because it'll do
-        // TimelineMetadata::to_bytes
-        if LEGACY_BINCODED_BYTES {
-            LegacyPaddedBytes(metadata).serialize(serializer)
-        } else {
-            JustTheBodyV2(metadata).serialize(serializer)
-        }
-    }
-
-    #[test]
-    fn deserializes_bytes_as_well_as_equivalent_body_v2() {
-        #[derive(serde::Deserialize, serde::Serialize)]
-        struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata);
-
-        let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";
-
-        let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();
-
-        let serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap();
-
-        assert_eq!(
-            serialized,
-            serde_json::json! {{
-                "disk_consistent_lsn": "0/149FD90",
-                "prev_record_lsn": "0/149FD18",
-                "ancestor_timeline": null,
-                "ancestor_lsn": "0/0",
-                "latest_gc_cutoff_lsn": "0/149FD18",
-                "initdb_lsn": "0/149FD18",
-                "pg_version": 15
-            }}
-        );
-
-        let wrapper_from_json = serde_json::value::from_value::<Wrapper>(serialized).unwrap();
-
-        assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0);
-    }
-}
-
 /// Parts of the metadata which are regularly modified.
 pub(crate) struct MetadataUpdate {
    disk_consistent_lsn: Lsn,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -45,7 +45,7 @@ use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
-use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
+use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -2833,13 +2833,7 @@ pub(crate) async fn immediate_gc(
        }
    }

-    result.map_err(|e| match e {
-        GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
-        GcError::TimelineNotFound => {
-            ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
-        }
-        other => ApiError::InternalServerError(anyhow::anyhow!(other)),
-    })
+    result.map_err(ApiError::InternalServerError)
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -91,7 +91,8 @@
 //!
 //! The *actual* remote state lags behind the *desired* remote state while
 //! there are in-flight operations.
-//! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`].
+//! We keep track of the desired remote state in
+//! [`UploadQueueInitialized::latest_files`] and [`UploadQueueInitialized::latest_metadata`].
 //! It is initialized based on the [`IndexPart`] that was passed during init
 //! and updated with every `schedule_*` function call.
 //! All this is necessary necessary to compute the future [`IndexPart`]s
@@ -114,7 +115,8 @@
 //!
 //! # Completion
 //!
-//! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately,
+//! Once an operation has completed, we update
+//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
 //! and submit a request through the DeletionQueue to update
 //! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
 //! validated that our generation is not stale.  It is this visible value
@@ -414,7 +416,6 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise.
    pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
        match &mut *self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
@@ -441,11 +442,13 @@ impl RemoteTimelineClient {
    /// Returns true if this timeline was previously detached at this Lsn and the remote timeline
    /// client is currently initialized.
    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
+        // technically this is a dirty read, but given how timeline detach ancestor is implemented
+        // via tenant restart, the lineage has always been uploaded.
        self.upload_queue
            .lock()
            .unwrap()
            .initialized_mut()
-            .map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn))
+            .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
            .unwrap_or(false)
    }

@@ -454,6 +457,7 @@ impl RemoteTimelineClient {
            current_remote_index_part
                .layer_metadata
                .values()
+                // If we don't have the file size for the layer, don't account for it in the metric.
                .map(|ilmd| ilmd.file_size)
                .sum()
        } else {
@@ -581,9 +585,9 @@ impl RemoteTimelineClient {

        // As documented in the struct definition, it's ok for latest_metadata to be
        // ahead of what's _actually_ on the remote during index upload.
-        upload_queue.dirty.metadata = metadata.clone();
+        upload_queue.latest_metadata = metadata.clone();

-        self.schedule_index_upload(upload_queue)?;
+        self.schedule_index_upload(upload_queue);

        Ok(())
    }
@@ -602,9 +606,9 @@ impl RemoteTimelineClient {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        upload_queue.dirty.metadata.apply(update);
+        upload_queue.latest_metadata.apply(update);

-        self.schedule_index_upload(upload_queue)?;
+        self.schedule_index_upload(upload_queue);

        Ok(())
    }
@@ -616,8 +620,8 @@ impl RemoteTimelineClient {
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;
-        upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
-        self.schedule_index_upload(upload_queue)?;
+        upload_queue.last_aux_file_policy = last_aux_file_policy;
+        self.schedule_index_upload(upload_queue);
        Ok(())
    }
    ///
@@ -635,44 +639,30 @@ impl RemoteTimelineClient {
        let upload_queue = guard.initialized_mut()?;

        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);
        }

        Ok(())
    }

    /// Launch an index-file upload operation in the background (internal function)
-    fn schedule_index_upload(
-        self: &Arc<Self>,
-        upload_queue: &mut UploadQueueInitialized,
-    ) -> anyhow::Result<()> {
-        let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
-        // fix up the duplicated field
-        upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn;
-
-        // make sure it serializes before doing it in perform_upload_task so that it doesn't
-        // look like a retryable error
-        let void = std::io::sink();
-        serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?;
-
-        let index_part = &upload_queue.dirty;
+    fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
+        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();

        info!(
            "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
-            index_part.layer_metadata.len(),
+            upload_queue.latest_files.len(),
            upload_queue.latest_files_changes_since_metadata_upload_scheduled,
        );

-        let op = UploadOp::UploadMetadata {
-            uploaded: Box::new(index_part.clone()),
-        };
+        let index_part = IndexPart::from(&*upload_queue);
+        let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;

        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
-        Ok(())
    }

    pub(crate) async fn schedule_reparenting_and_wait(
@@ -685,16 +675,16 @@ impl RemoteTimelineClient {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else {
+            let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
                return Err(anyhow::anyhow!(
                    "cannot reparent without a current ancestor"
                ));
            };

-            upload_queue.dirty.metadata.reparent(new_parent);
-            upload_queue.dirty.lineage.record_previous_ancestor(&prev);
+            upload_queue.latest_metadata.reparent(new_parent);
+            upload_queue.latest_lineage.record_previous_ancestor(&prev);

-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);

            self.schedule_barrier0(upload_queue)
        };
@@ -715,17 +705,16 @@ impl RemoteTimelineClient {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
-            upload_queue.dirty.lineage.record_detaching(&adopted);
+            upload_queue.latest_metadata.detach_from_ancestor(&adopted);
+            upload_queue.latest_lineage.record_detaching(&adopted);

            for layer in layers {
                upload_queue
-                    .dirty
-                    .layer_metadata
+                    .latest_files
                    .insert(layer.layer_desc().layer_name(), layer.metadata());
            }

-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);

            let barrier = self.schedule_barrier0(upload_queue);
            self.launch_queued_tasks(upload_queue);
@@ -757,8 +746,7 @@ impl RemoteTimelineClient {
        let metadata = layer.metadata();

        upload_queue
-            .dirty
-            .layer_metadata
+            .latest_files
            .insert(layer.layer_desc().layer_name(), metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

@@ -788,8 +776,8 @@ impl RemoteTimelineClient {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        let with_metadata = self
-            .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?;
+        let with_metadata =
+            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());

        self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);

@@ -813,7 +801,7 @@ impl RemoteTimelineClient {

        let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());

-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);

        self.launch_queued_tasks(upload_queue);

@@ -826,7 +814,7 @@ impl RemoteTimelineClient {
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
        names: I,
-    ) -> anyhow::Result<Vec<(LayerName, LayerFileMetadata)>>
+    ) -> Vec<(LayerName, LayerFileMetadata)>
    where
        I: IntoIterator<Item = LayerName>,
    {
@@ -836,7 +824,7 @@ impl RemoteTimelineClient {
        let with_metadata: Vec<_> = names
            .into_iter()
            .filter_map(|name| {
-                let meta = upload_queue.dirty.layer_metadata.remove(&name);
+                let meta = upload_queue.latest_files.remove(&name);

                if let Some(meta) = meta {
                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -868,10 +856,10 @@ impl RemoteTimelineClient {
        // index_part update, because that needs to be uploaded before we can actually delete the
        // files.
        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);
        }

-        Ok(with_metadata)
+        with_metadata
    }

    /// Schedules deletion for layer files which have previously been unlinked from the
@@ -962,7 +950,7 @@ impl RemoteTimelineClient {

        let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());

-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
        self.launch_queued_tasks(upload_queue);

        Ok(())
@@ -1097,7 +1085,7 @@ impl RemoteTimelineClient {
            let deleted_at = Utc::now().naive_utc();
            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);

-            let mut index_part = stopped.upload_queue_for_deletion.dirty.clone();
+            let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
            index_part.deleted_at = Some(deleted_at);
            index_part
        };
@@ -1308,8 +1296,7 @@ impl RemoteTimelineClient {

            stopped
                .upload_queue_for_deletion
-                .dirty
-                .layer_metadata
+                .latest_files
                .drain()
                .map(|(file_name, meta)| {
                    remote_layer_path(
@@ -1446,7 +1433,7 @@ impl RemoteTimelineClient {
                    // Can always be scheduled.
                    true
                }
-                UploadOp::UploadMetadata { .. } => {
+                UploadOp::UploadMetadata(_, _) => {
                    // These can only be performed after all the preceding operations
                    // have finished.
                    upload_queue.inprogress_tasks.is_empty()
@@ -1488,7 +1475,7 @@ impl RemoteTimelineClient {
                UploadOp::UploadLayer(_, _) => {
                    upload_queue.num_inprogress_layer_uploads += 1;
                }
-                UploadOp::UploadMetadata { .. } => {
+                UploadOp::UploadMetadata(_, _) => {
                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
                UploadOp::Delete(_) => {
@@ -1597,13 +1584,22 @@ impl RemoteTimelineClient {
                    )
                    .await
                }
-                UploadOp::UploadMetadata { ref uploaded } => {
+                UploadOp::UploadMetadata(ref index_part, _lsn) => {
+                    let mention_having_future_layers = if cfg!(feature = "testing") {
+                        index_part
+                            .layer_metadata
+                            .keys()
+                            .any(|x| x.is_in_future(*_lsn))
+                    } else {
+                        false
+                    };
+
                    let res = upload::upload_index_part(
                        &self.storage_impl,
                        &self.tenant_shard_id,
                        &self.timeline_id,
                        self.generation,
-                        uploaded,
+                        index_part,
                        &self.cancel,
                    )
                    .measure_remote_op(
@@ -1613,21 +1609,10 @@ impl RemoteTimelineClient {
                    )
                    .await;
                    if res.is_ok() {
-                        self.update_remote_physical_size_gauge(Some(uploaded));
-                        let mention_having_future_layers = if cfg!(feature = "testing") {
-                            uploaded
-                                .layer_metadata
-                                .keys()
-                                .any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn()))
-                        } else {
-                            false
-                        };
+                        self.update_remote_physical_size_gauge(Some(index_part));
                        if mention_having_future_layers {
                            // find rationale near crate::tenant::timeline::init::cleanup_future_layer
-                            tracing::info!(
-                                disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(),
-                                "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup"
-                            );
+                            tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
                        }
                    }
                    res
@@ -1728,23 +1713,11 @@ impl RemoteTimelineClient {
                    upload_queue.num_inprogress_layer_uploads -= 1;
                    None
                }
-                UploadOp::UploadMetadata { ref uploaded } => {
+                UploadOp::UploadMetadata(_, lsn) => {
                    upload_queue.num_inprogress_metadata_uploads -= 1;
+                    // XXX monotonicity check?

-                    // the task id is reused as a monotonicity check for storing the "clean"
-                    // IndexPart.
-                    let last_updater = upload_queue.clean.1;
-                    let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id);
-                    let monotone = is_later || last_updater.is_none();
-
-                    assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id);
-
-                    // not taking ownership is wasteful
-                    upload_queue.clean.0.clone_from(uploaded);
-                    upload_queue.clean.1 = Some(task.task_id);
-
-                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
-
+                    upload_queue.projected_remote_consistent_lsn = Some(lsn);
                    if self.generation.is_none() {
                        // Legacy mode: skip validating generation
                        upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -1798,7 +1771,7 @@ impl RemoteTimelineClient {
                RemoteOpKind::Upload,
                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
            ),
-            UploadOp::UploadMetadata { .. } => (
+            UploadOp::UploadMetadata(_, _) => (
                RemoteOpFileKind::Index,
                RemoteOpKind::Upload,
                DontTrackSize {
@@ -1874,9 +1847,11 @@ impl RemoteTimelineClient {
                    // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
                    let upload_queue_for_deletion = UploadQueueInitialized {
                        task_counter: 0,
-                        dirty: initialized.dirty.clone(),
-                        clean: initialized.clean.clone(),
+                        latest_files: initialized.latest_files.clone(),
                        latest_files_changes_since_metadata_upload_scheduled: 0,
+                        latest_metadata: initialized.latest_metadata.clone(),
+                        latest_lineage: initialized.latest_lineage.clone(),
+                        projected_remote_consistent_lsn: None,
                        visible_remote_consistent_lsn: initialized
                            .visible_remote_consistent_lsn
                            .clone(),
@@ -1889,6 +1864,7 @@ impl RemoteTimelineClient {
                        dangling_files: HashMap::default(),
                        shutting_down: false,
                        shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+                        last_aux_file_policy: initialized.last_aux_file_policy,
                    };

                    let upload_queue = std::mem::replace(
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -28,7 +28,6 @@ use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
-use utils::pausable_failpoint;

 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
@@ -153,8 +152,6 @@ async fn download_object<'a>(

                let download = storage.download(src_path, cancel).await?;

-                pausable_failpoint!("before-downloading-layer-stream-pausable");
-
                let mut buf_writer =
                    tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);

@@ -202,8 +199,6 @@ async fn download_object<'a>(

                let mut download = storage.download(src_path, cancel).await?;

-                pausable_failpoint!("before-downloading-layer-stream-pausable");
-
                // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
                // There's chunks_vectored() on the stream.
                let (bytes_amount, destination_file) = async {
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -11,6 +11,7 @@ use utils::id::TimelineId;

 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerName;
+use crate::tenant::upload_queue::UploadQueueInitialized;
 use crate::tenant::Generation;
 use pageserver_api::shard::ShardIndex;

@@ -41,13 +42,9 @@ pub struct IndexPart {
    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
    // It's duplicated for convenience when reading the serialized structure, but is
    // private because internally we would read from metadata instead.
-    pub(super) disk_consistent_lsn: Lsn,
+    disk_consistent_lsn: Lsn,

-    // TODO: later make this "rename" to "alias", rename field as "legacy_metadata"
-    #[serde(
-        rename = "metadata_bytes",
-        with = "crate::tenant::metadata::modern_serde"
-    )]
+    #[serde(rename = "metadata_bytes")]
    pub metadata: TimelineMetadata,

    #[serde(default)]
@@ -83,15 +80,23 @@ impl IndexPart {

    pub const FILE_NAME: &'static str = "index_part.json";

-    pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
-        IndexPart {
+    fn new(
+        layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
+        disk_consistent_lsn: Lsn,
+        metadata: TimelineMetadata,
+        lineage: Lineage,
+        last_aux_file_policy: Option<AuxFilePolicy>,
+    ) -> Self {
+        let layer_metadata = layers_and_metadata.clone();
+
+        Self {
            version: Self::LATEST_VERSION,
-            layer_metadata: Default::default(),
-            disk_consistent_lsn: metadata.disk_consistent_lsn(),
+            layer_metadata,
+            disk_consistent_lsn,
            metadata,
            deleted_at: None,
-            lineage: Default::default(),
-            last_aux_file_policy: None,
+            lineage,
+            last_aux_file_policy,
        }
    }

@@ -101,7 +106,7 @@ impl IndexPart {

    /// If you want this under normal operations, read it from self.metadata:
    /// this method is just for the scrubber to use when validating an index.
-    pub fn duplicated_disk_consistent_lsn(&self) -> Lsn {
+    pub fn get_disk_consistent_lsn(&self) -> Lsn {
        self.disk_consistent_lsn
    }

@@ -115,7 +120,14 @@ impl IndexPart {

    #[cfg(test)]
    pub(crate) fn example() -> Self {
-        Self::empty(TimelineMetadata::example())
+        let example_metadata = TimelineMetadata::example();
+        Self::new(
+            &HashMap::new(),
+            example_metadata.disk_consistent_lsn(),
+            example_metadata,
+            Default::default(),
+            Some(AuxFilePolicy::V1),
+        )
    }

    pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
@@ -123,6 +135,22 @@ impl IndexPart {
    }
 }

+impl From<&UploadQueueInitialized> for IndexPart {
+    fn from(uq: &UploadQueueInitialized) -> Self {
+        let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
+        let metadata = uq.latest_metadata.clone();
+        let lineage = uq.latest_lineage.clone();
+
+        Self::new(
+            &uq.latest_files,
+            disk_consistent_lsn,
+            metadata,
+            lineage,
+            uq.last_aux_file_policy,
+        )
+    }
+}
+
 /// Metadata gathered for each of the layer files.
 ///
 /// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
@@ -208,10 +236,11 @@ impl Lineage {
    /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
    /// to start a read/write primary at this lsn".
    ///
-    /// Returns true if the Lsn was previously our branch point.
+    /// Returns true if the Lsn was previously a branch point.
    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
        self.original_ancestor
-            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
+            .as_ref()
+            .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
    }
 }

--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -1,7 +1,6 @@
 //! Helper functions to upload files to remote storage with a RemoteStorage

 use anyhow::{bail, Context};
-use bytes::Bytes;
 use camino::Utf8Path;
 use fail::fail_point;
 use pageserver_api::shard::TenantShardId;
@@ -12,10 +11,10 @@ use tokio::io::AsyncSeekExt;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, pausable_failpoint};

-use super::index::IndexPart;
 use super::Generation;
 use crate::tenant::remote_timeline_client::{
-    remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
+    index::IndexPart, remote_index_path, remote_initdb_archive_path,
+    remote_initdb_preserved_archive_path,
 };
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
 use utils::id::{TenantId, TimelineId};
@@ -28,7 +27,7 @@ pub(crate) async fn upload_index_part<'a>(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    generation: Generation,
-    index_part: &IndexPart,
+    index_part: &'a IndexPart,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    tracing::trace!("uploading new index part");
@@ -38,16 +37,16 @@ pub(crate) async fn upload_index_part<'a>(
    });
    pausable_failpoint!("before-upload-index-pausable");

-    // FIXME: this error comes too late
-    let serialized = index_part.to_s3_bytes()?;
-    let serialized = Bytes::from(serialized);
-
-    let index_part_size = serialized.len();
+    let index_part_bytes = index_part
+        .to_s3_bytes()
+        .context("serialize index part file into bytes")?;
+    let index_part_size = index_part_bytes.len();
+    let index_part_bytes = bytes::Bytes::from(index_part_bytes);

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
    storage
        .upload_storage_object(
-            futures::stream::once(futures::future::ready(Ok(serialized))),
+            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
            index_part_size,
            &remote_path,
            cancel,
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -1000,7 +1000,7 @@ impl<'a> TenantDownloader<'a> {
            layer.name,
            layer.metadata.file_size
        );
-        let downloaded_bytes = download_layer_file(
+        let downloaded_bytes = match download_layer_file(
            self.conf,
            self.remote_storage,
            *tenant_shard_id,
@@ -1011,9 +1011,8 @@ impl<'a> TenantDownloader<'a> {
            &self.secondary_state.cancel,
            ctx,
        )
-        .await;
-
-        let downloaded_bytes = match downloaded_bytes {
+        .await
+        {
            Ok(bytes) => bytes,
            Err(DownloadError::NotFound) => {
                // A heatmap might be out of date and refer to a layer that doesn't exist any more.
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -334,11 +334,8 @@ where

        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            tracing::info!(
-                tenant_id=%tenant_shard_id.tenant_id,
-                shard_id=%tenant_shard_id.shard_slug(),
-                "Command already running, waiting for it"
-            );
+            tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                           "Command already running, waiting for it");
            barrier
        } else {
            let running = self.spawn_now(job);
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
 #[derive(Debug)]
 struct LayerKeyspace {
    layer: ReadableLayer,
-    target_keyspace: Vec<KeySpace>,
+    target_keyspace: KeySpace,
 }

 impl LayerFringe {
@@ -336,7 +336,6 @@ impl LayerFringe {
        };

        let removed = self.layers.remove_entry(&read_desc.layer_id);
-
        match removed {
            Some((
                _,
@@ -344,15 +343,7 @@ impl LayerFringe {
                    layer,
                    target_keyspace,
                },
-            )) => {
-                let mut keyspace = KeySpaceRandomAccum::new();
-                for ks in target_keyspace {
-                    for part in ks.ranges {
-                        keyspace.add_range(part);
-                    }
-                }
-                Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
-            }
+            )) => Some((layer, target_keyspace, read_desc.lsn_range)),
            None => unreachable!("fringe internals are always consistent"),
        }
    }
@@ -367,7 +358,7 @@ impl LayerFringe {
        let entry = self.layers.entry(layer_id.clone());
        match entry {
            Entry::Occupied(mut entry) => {
-                entry.get_mut().target_keyspace.push(keyspace);
+                entry.get_mut().target_keyspace.merge(&keyspace);
            }
            Entry::Vacant(entry) => {
                self.planned_reads_by_lsn.push(ReadDesc {
@@ -376,7 +367,7 @@ impl LayerFringe {
                });
                entry.insert(LayerKeyspace {
                    layer,
-                    target_keyspace: vec![keyspace],
+                    target_keyspace: keyspace,
                });
            }
        }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -478,23 +478,6 @@ impl DeltaLayerWriterInner {
        key_end: Key,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<ResidentLayer> {
-        let temp_path = self.path.clone();
-        let result = self.finish0(key_end, timeline, ctx).await;
-        if result.is_err() {
-            tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
-            if let Err(e) = std::fs::remove_file(&temp_path) {
-                tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
-            }
-        }
-        result
-    }
-
-    async fn finish0(
-        self,
-        key_end: Key,
-        timeline: &Arc<Timeline>,
-        ctx: &RequestContext,
    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -668,11 +651,19 @@ impl DeltaLayerWriter {
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(key_end, timeline, ctx)
-            .await
+        let inner = self.inner.take().unwrap();
+        let temp_path = inner.path.clone();
+        let result = inner.finish(key_end, timeline, ctx).await;
+        // The delta layer files can sometimes be really large. Clean them up.
+        if result.is_err() {
+            tracing::warn!(
+                "Cleaning up temporary delta file {temp_path} after error during writing"
+            );
+            if let Err(e) = std::fs::remove_file(&temp_path) {
+                tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}")
+            }
+        }
+        result
    }
 }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -917,57 +917,26 @@ impl Drop for ImageLayerWriter {

 #[cfg(test)]
 mod test {
-    use std::time::Duration;
-
    use bytes::Bytes;
    use pageserver_api::{
        key::Key,
        shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
    };
-    use utils::{
-        generation::Generation,
-        id::{TenantId, TimelineId},
-        lsn::Lsn,
-    };
+    use utils::{id::TimelineId, lsn::Lsn};

-    use crate::{
-        tenant::{config::TenantConf, harness::TenantHarness},
-        DEFAULT_PG_VERSION,
-    };
+    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};

    use super::ImageLayerWriter;

    #[tokio::test]
    async fn image_layer_rewrite() {
-        let tenant_conf = TenantConf {
-            gc_period: Duration::ZERO,
-            compaction_period: Duration::ZERO,
-            ..TenantConf::default()
-        };
-        let tenant_id = TenantId::generate();
-        let mut gen = Generation::new(0xdead0001);
-        let mut get_next_gen = || {
-            let ret = gen;
-            gen = gen.next();
-            ret
-        };
+        let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
+        let (tenant, ctx) = harness.load().await;
+
        // The LSN at which we will create an image layer to filter
        let lsn = Lsn(0xdeadbeef0000);
+
        let timeline_id = TimelineId::generate();
-
-        //
-        // Create an unsharded parent with a layer.
-        //
-
-        let harness = TenantHarness::create_custom(
-            "test_image_layer_rewrite--parent",
-            tenant_conf.clone(),
-            tenant_id,
-            ShardIdentity::unsharded(),
-            get_next_gen(),
-        )
-        .unwrap();
-        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
            .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
            .await
@@ -1002,47 +971,9 @@ mod test {
        };
        let original_size = resident.metadata().file_size;

-        //
-        // Create child shards and do the rewrite, exercising filter().
-        // TODO: abstraction in TenantHarness for splits.
-        //
-
        // Filter for various shards: this exercises cases like values at start of key range, end of key
        // range, middle of key range.
-        let shard_count = ShardCount::new(4);
-        for shard_number in 0..shard_count.count() {
-            //
-            // mimic the shard split
-            //
-            let shard_identity = ShardIdentity::new(
-                ShardNumber(shard_number),
-                shard_count,
-                ShardStripeSize(0x8000),
-            )
-            .unwrap();
-            let harness = TenantHarness::create_custom(
-                Box::leak(Box::new(format!(
-                    "test_image_layer_rewrite--child{}",
-                    shard_identity.shard_slug()
-                ))),
-                tenant_conf.clone(),
-                tenant_id,
-                shard_identity,
-                // NB: in reality, the shards would each fork off their own gen number sequence from the parent.
-                // But here, all we care about is that the gen number is unique.
-                get_next_gen(),
-            )
-            .unwrap();
-            let (tenant, ctx) = harness.load().await;
-            let timeline = tenant
-                .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
-                .await
-                .unwrap();
-
-            //
-            // use filter() and make assertions
-            //
-
+        for shard_number in 0..4 {
            let mut filtered_writer = ImageLayerWriter::new(
                harness.conf,
                timeline_id,
@@ -1054,6 +985,15 @@ mod test {
            .await
            .unwrap();

+            // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
+            // to exercise filter()
+            let shard_identity = ShardIdentity::new(
+                ShardNumber(shard_number),
+                ShardCount::new(4),
+                ShardStripeSize(0x8000),
+            )
+            .unwrap();
+
            let wrote_keys = resident
                .filter(&shard_identity, &mut filtered_writer, &ctx)
                .await
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -277,10 +277,9 @@ impl Layer {

        let downloaded = resident.expect("just initialized");

-        // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
-        // TODO: this leaves the temp file in place if the rename fails, risking us running
-        // out of space. Should we clean it up here or does the calling context deal with this?
-        utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
+        // if the rename works, the path is as expected
+        // TODO: sync system call
+        std::fs::rename(temp_path, owner.local_path())
            .with_context(|| format!("rename temporary file as correct path for {owner}"))?;

        Ok(ResidentLayer { downloaded, owner })
@@ -367,10 +366,7 @@ impl Layer {
            .0
            .get_or_maybe_download(true, Some(ctx))
            .await
-            .map_err(|err| match err {
-                DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
-                other => GetVectoredError::Other(anyhow::anyhow!(other)),
-            })?;
+            .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;

        self.0
            .access_stats
@@ -1162,11 +1158,6 @@ impl LayerInner {
                let consecutive_failures =
                    1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);

-                if timeline.cancel.is_cancelled() {
-                    // If we're shutting down, drop out before logging the error
-                    return Err(e);
-                }
-
                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");

                let backoff = utils::backoff::exponential_backoff_duration_seconds(
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -380,28 +380,21 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                let res = tenant
                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
                    .await;
-                match res {
-                    Ok(_) => {
-                        error_run_count = 0;
-                        period
-                    }
-                    Err(crate::tenant::GcError::TenantCancelled) => {
-                        return;
-                    }
-                    Err(e) => {
-                        let wait_duration = backoff::exponential_backoff_duration_seconds(
-                            error_run_count + 1,
-                            1.0,
-                            MAX_BACKOFF_SECS,
-                        );
-                        error_run_count += 1;
-                        let wait_duration = Duration::from_secs_f64(wait_duration);
-
-                        error!(
+                if let Err(e) = res {
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count + 1,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    let wait_duration = Duration::from_secs_f64(wait_duration);
+                    error!(
                        "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
                    );
-                        wait_duration
-                    }
+                    wait_duration
+                } else {
+                    error_run_count = 0;
+                    period
                }
            };

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -102,6 +102,7 @@ use crate::metrics::{
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
+use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIndex;

@@ -130,17 +131,14 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

+use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
 use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
-use super::{
-    secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
-    GcError,
-};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub(crate) enum FlushLoopState {
+pub(super) enum FlushLoopState {
    NotStarted,
    Running {
        #[cfg(test)]
@@ -498,11 +496,15 @@ pub(crate) enum PageReconstructError {
    Other(#[from] anyhow::Error),

    #[error("Ancestor LSN wait error: {0}")]
-    AncestorLsnTimeout(WaitLsnError),
+    AncestorLsnTimeout(#[from] WaitLsnError),

    #[error("timeline shutting down")]
    Cancelled,

+    /// The ancestor of this is being stopped
+    #[error("ancestor timeline {0} is being stopped")]
+    AncestorStopping(TimelineId),
+
    /// An error happened replaying WAL records
    #[error(transparent)]
    WalRedo(anyhow::Error),
@@ -567,7 +569,7 @@ impl PageReconstructError {
        match self {
            Other(_) => false,
            AncestorLsnTimeout(_) => false,
-            Cancelled => true,
+            Cancelled | AncestorStopping(_) => true,
            WalRedo(_) => false,
            MissingKey { .. } => false,
        }
@@ -575,7 +577,7 @@ impl PageReconstructError {
 }

 #[derive(thiserror::Error, Debug)]
-pub(crate) enum CreateImageLayersError {
+enum CreateImageLayersError {
    #[error("timeline shutting down")]
    Cancelled,

@@ -589,35 +591,17 @@ pub(crate) enum CreateImageLayersError {
    Other(#[from] anyhow::Error),
 }

-#[derive(thiserror::Error, Debug, Clone)]
-pub(crate) enum FlushLayerError {
+#[derive(thiserror::Error, Debug)]
+enum FlushLayerError {
    /// Timeline cancellation token was cancelled
    #[error("timeline shutting down")]
    Cancelled,

-    /// We tried to flush a layer while the Timeline is in an unexpected state
-    #[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")]
-    NotRunning(FlushLoopState),
-
-    // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush
-    // loop via a watch channel, where we can only borrow it.
    #[error(transparent)]
-    CreateImageLayersError(Arc<CreateImageLayersError>),
+    CreateImageLayersError(CreateImageLayersError),

    #[error(transparent)]
-    Other(#[from] Arc<anyhow::Error>),
-}
-
-impl FlushLayerError {
-    // When crossing from generic anyhow errors to this error type, we explicitly check
-    // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err.
-    fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self {
-        if timeline.cancel.is_cancelled() {
-            Self::Cancelled
-        } else {
-            Self::Other(Arc::new(err))
-        }
-    }
+    Other(#[from] anyhow::Error),
 }

 #[derive(thiserror::Error, Debug)]
@@ -643,17 +627,17 @@ pub(crate) enum GetVectoredError {

 #[derive(thiserror::Error, Debug)]
 pub(crate) enum GetReadyAncestorError {
+    #[error("ancestor timeline {0} is being stopped")]
+    AncestorStopping(TimelineId),
+
    #[error("Ancestor LSN wait error: {0}")]
    AncestorLsnTimeout(#[from] WaitLsnError),

-    #[error("Bad state on timeline {timeline_id}: {state:?}")]
-    BadState {
-        timeline_id: TimelineId,
-        state: TimelineState,
-    },
-
    #[error("Cancelled")]
    Cancelled,
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
 }

 #[derive(Clone, Copy)]
@@ -688,8 +672,8 @@ pub(crate) enum WaitLsnError {
    Shutdown,

    // Called on an timeline not in active state or shutting down
-    #[error("Bad timeline state: {0:?}")]
-    BadState(TimelineState),
+    #[error("Bad state (not active)")]
+    BadState,

    // Timeout expired while waiting for LSN to catch up with goal.
    #[error("{0}")]
@@ -712,7 +696,7 @@ impl From<CreateImageLayersError> for FlushLayerError {
    fn from(e: CreateImageLayersError) -> Self {
        match e {
            CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
-            any => FlushLayerError::CreateImageLayersError(Arc::new(any)),
+            any => FlushLayerError::CreateImageLayersError(any),
        }
    }
 }
@@ -752,9 +736,10 @@ impl From<GetReadyAncestorError> for PageReconstructError {
    fn from(e: GetReadyAncestorError) -> Self {
        use GetReadyAncestorError::*;
        match e {
+            AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
            AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
-            bad_state @ BadState { .. } => PageReconstructError::Other(anyhow::anyhow!(bad_state)),
            Cancelled => PageReconstructError::Cancelled,
+            Other(other) => PageReconstructError::Other(other),
        }
    }
 }
@@ -1186,7 +1171,9 @@ impl Timeline {

                use PageReconstructError::*;
                match block {
-                    Err(Cancelled) => return Err(GetVectoredError::Cancelled),
+                    Err(Cancelled | AncestorStopping(_)) => {
+                        return Err(GetVectoredError::Cancelled)
+                    }
                    Err(MissingKey(_))
                        if NON_INHERITED_RANGE.contains(&key)
                            || NON_INHERITED_SPARSE_RANGE.contains(&key) =>
@@ -1461,11 +1448,10 @@ impl Timeline {
        who_is_waiting: WaitLsnWaiter<'_>,
        ctx: &RequestContext, /* Prepare for use by cancellation */
    ) -> Result<(), WaitLsnError> {
-        let state = self.current_state();
-        if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) {
+        if self.cancel.is_cancelled() {
            return Err(WaitLsnError::Shutdown);
-        } else if !matches!(state, TimelineState::Active) {
-            return Err(WaitLsnError::BadState(state));
+        } else if !self.is_active() {
+            return Err(WaitLsnError::BadState);
        }

        if cfg!(debug_assertions) {
@@ -1561,13 +1547,13 @@ impl Timeline {

    /// Flush to disk all data that was written with the put_* functions
    #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
-    pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
+    pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
        self.freeze_and_flush0().await
    }

    // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
    // polluting the span hierarchy.
-    pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
+    pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> {
        let to_lsn = self.freeze_inmem_layer(false).await;
        self.flush_frozen_layers_and_wait(to_lsn).await
    }
@@ -2749,6 +2735,11 @@ impl Timeline {
            self.current_logical_size.initialized.add_permits(1);
        }

+        enum BackgroundCalculationError { // error type of one `try_once` attempt
+            Cancelled, // cancellation (or a stopping ancestor): the retry loop breaks out
+            Other(anyhow::Error), // any other failure: the retry loop logs it with `warn!`
+        }
+
        let try_once = |attempt: usize| {
            let background_ctx = &background_ctx;
            let self_ref = &self;
@@ -2766,10 +2757,10 @@ impl Timeline {
                        (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
                    }
                    _ = self_ref.cancel.cancelled() => {
-                        return Err(CalculateLogicalSizeError::Cancelled);
+                        return Err(BackgroundCalculationError::Cancelled);
                    }
                    _ = cancel.cancelled() => {
-                        return Err(CalculateLogicalSizeError::Cancelled);
+                        return Err(BackgroundCalculationError::Cancelled);
                    },
                    () = skip_concurrency_limiter.cancelled() => {
                        // Some action that is part of a end user interaction requested logical size
@@ -2787,21 +2778,28 @@ impl Timeline {
                    crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
                };

-                let calculated_size = self_ref
+                match self_ref
                    .logical_size_calculation_task(
                        initial_part_end,
                        LogicalSizeCalculationCause::Initial,
                        background_ctx,
                    )
-                    .await?;
-
-                self_ref
-                    .trigger_aux_file_size_computation(initial_part_end, background_ctx)
-                    .await?;
-
-                // TODO: add aux file size to logical size
-
-                Ok((calculated_size, metrics_guard))
+                    .await
+                {
+                    Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
+                    Err(CalculateLogicalSizeError::Cancelled) => {
+                        Err(BackgroundCalculationError::Cancelled)
+                    }
+                    Err(CalculateLogicalSizeError::Other(err)) => {
+                        if let Some(PageReconstructError::AncestorStopping(_)) =
+                            err.root_cause().downcast_ref()
+                        {
+                            Err(BackgroundCalculationError::Cancelled)
+                        } else {
+                            Err(BackgroundCalculationError::Other(err))
+                        }
+                    }
+                }
            }
        };

@@ -2812,11 +2810,8 @@ impl Timeline {

                match try_once(attempt).await {
                    Ok(res) => return ControlFlow::Continue(res),
-                    Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()),
-                    Err(
-                        e @ (CalculateLogicalSizeError::Decode(_)
-                        | CalculateLogicalSizeError::PageRead(_)),
-                    ) => {
+                    Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
+                    Err(BackgroundCalculationError::Other(e)) => {
                        warn!(attempt, "initial size calculation failed: {e:?}");
                        // exponential back-off doesn't make sense at these long intervals;
                        // use fixed retry interval with generous jitter instead
@@ -3193,21 +3188,17 @@ impl Timeline {
            }

            // Recurse into ancestor if needed
-            if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
-                if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
-                    trace!(
-                        "going into ancestor {}, cont_lsn is {}",
-                        timeline.ancestor_lsn,
-                        cont_lsn
-                    );
+            if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
+                trace!(
+                    "going into ancestor {}, cont_lsn is {}",
+                    timeline.ancestor_lsn,
+                    cont_lsn
+                );

-                    timeline_owned = timeline
-                        .get_ready_ancestor_timeline(ancestor_timeline, ctx)
-                        .await?;
-                    timeline = &*timeline_owned;
-                    prev_lsn = None;
-                    continue 'outer;
-                }
+                timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
+                timeline = &*timeline_owned;
+                prev_lsn = None;
+                continue 'outer;
            }

            let guard = timeline.layers.read().await;
@@ -3356,10 +3347,10 @@ impl Timeline {
                break None;
            }

-            let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
-                // Not fully retrieved but no ancestor timeline.
+            // Not fully retrieved but no ancestor timeline.
+            if timeline.ancestor_timeline.is_none() {
                break Some(keyspace);
-            };
+            }

            // Now we see if there are keys covered by the image layer but does not exist in the
            // image layer, which means that the key does not exist.
@@ -3379,7 +3370,7 @@ impl Timeline {
            // Take the min to avoid reconstructing a page with data newer than request Lsn.
            cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
            timeline_owned = timeline
-                .get_ready_ancestor_timeline(ancestor_timeline, ctx)
+                .get_ready_ancestor_timeline(ctx)
                .await
                .map_err(GetVectoredError::GetReadyAncestorError)?;
            timeline = &*timeline_owned;
@@ -3551,9 +3542,13 @@ impl Timeline {

    async fn get_ready_ancestor_timeline(
        &self,
-        ancestor: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, GetReadyAncestorError> {
+        let ancestor = match self.get_ancestor_timeline() {
+            Ok(timeline) => timeline,
+            Err(e) => return Err(GetReadyAncestorError::from(e)),
+        };
+
        // It's possible that the ancestor timeline isn't active yet, or
        // is active but hasn't yet caught up to the branch point. Wait
        // for it.
@@ -3581,14 +3576,16 @@ impl Timeline {
        match ancestor.wait_to_become_active(ctx).await {
            Ok(()) => {}
            Err(TimelineState::Stopping) => {
-                // If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping.
-                return Err(GetReadyAncestorError::Cancelled);
+                return Err(GetReadyAncestorError::AncestorStopping(
+                    ancestor.timeline_id,
+                ));
            }
            Err(state) => {
-                return Err(GetReadyAncestorError::BadState {
-                    timeline_id: ancestor.timeline_id,
-                    state,
-                });
+                return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
+                    "Timeline {} will not become active. Current state: {:?}",
+                    ancestor.timeline_id,
+                    &state,
+                )));
            }
        }
        ancestor
@@ -3597,17 +3594,21 @@ impl Timeline {
            .map_err(|e| match e {
                e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
                WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
-                WaitLsnError::BadState(state) => GetReadyAncestorError::BadState {
-                    timeline_id: ancestor.timeline_id,
-                    state,
-                },
+                e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
            })?;

-        Ok(ancestor.clone())
+        Ok(ancestor)
    }

-    pub(crate) fn get_ancestor_timeline(&self) -> Option<Arc<Timeline>> {
-        self.ancestor_timeline.clone()
+    pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
+        let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
+            format!(
+                "Ancestor is missing. Timeline id: {} Ancestor id {:?}",
+                self.timeline_id,
+                self.get_ancestor_timeline_id(),
+            )
+        })?;
+        Ok(Arc::clone(ancestor))
    }

    pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
@@ -3716,9 +3717,7 @@ impl Timeline {
                        return;
                    }
                    err @ Err(
-                        FlushLayerError::NotRunning(_)
-                        | FlushLayerError::Other(_)
-                        | FlushLayerError::CreateImageLayersError(_),
+                        FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
                    ) => {
                        error!("could not flush frozen layer: {err:?}");
                        break err.map(|_| ());
@@ -3764,10 +3763,7 @@ impl Timeline {
    /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
    /// it means no data will be written between the top of the highest frozen layer and to_lsn,
    /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
-    async fn flush_frozen_layers_and_wait(
-        &self,
-        last_record_lsn: Lsn,
-    ) -> Result<(), FlushLayerError> {
+    async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
        let mut rx = self.layer_flush_done_tx.subscribe();

        // Increment the flush cycle counter and wake up the flush task.
@@ -3778,7 +3774,7 @@ impl Timeline {

        let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
        if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
-            return Err(FlushLayerError::NotRunning(flush_loop_state));
+            anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
        }

        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
@@ -3791,11 +3787,14 @@ impl Timeline {
            {
                let (last_result_counter, last_result) = &*rx.borrow();
                if *last_result_counter >= my_flush_request {
-                    if let Err(err) = last_result {
+                    if let Err(_err) = last_result {
                        // We already logged the original error in
                        // flush_loop. We cannot propagate it to the caller
                        // here, because it might not be Cloneable
-                        return Err(err.clone());
+                        anyhow::bail!(
+                            "Could not flush frozen layer. Request id: {}",
+                            my_flush_request
+                        );
                    } else {
                        return Ok(());
                    }
@@ -3804,7 +3803,7 @@ impl Timeline {
            trace!("waiting for flush to complete");
            tokio::select! {
                rx_e = rx.changed() => {
-                    rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?;
+                    rx_e?;
                },
                // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
                // the notification from [`flush_loop`] that it completed.
@@ -3876,36 +3875,31 @@ impl Timeline {
                    EnumSet::empty(),
                    ctx,
                )
-                .await
-                .map_err(|e| FlushLayerError::from_anyhow(self, e))?;
+                .await?;

            if self.cancel.is_cancelled() {
                return Err(FlushLayerError::Cancelled);
            }

-            // FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well?
-            // This code path will not be hit during regression tests. After #7099 we have a single partition
-            // with two key ranges. If someone wants to fix initdb optimization in the future, this might need
-            // to be fixed.
-
            // For metadata, always create delta layers.
            let delta_layer = if !metadata_partition.parts.is_empty() {
                assert_eq!(
                    metadata_partition.parts.len(),
                    1,
-                    "currently sparse keyspace should only contain a single metadata keyspace"
+                    "currently sparse keyspace should only contain a single aux file keyspace"
                );
                let metadata_keyspace = &metadata_partition.parts[0];
+                assert_eq!(
+                    metadata_keyspace.0.ranges.len(),
+                    1,
+                    "aux file keyspace should be a single range"
+                );
                self.create_delta_layer(
                    &frozen_layer,
-                    Some(
-                        metadata_keyspace.0.ranges.first().unwrap().start
-                            ..metadata_keyspace.0.ranges.last().unwrap().end,
-                    ),
+                    Some(metadata_keyspace.0.ranges[0].clone()),
                    ctx,
                )
-                .await
-                .map_err(|e| FlushLayerError::from_anyhow(self, e))?
+                .await?
            } else {
                None
            };
@@ -3932,11 +3926,7 @@ impl Timeline {
            // Normal case, write out a L0 delta layer file.
            // `create_delta_layer` will not modify the layer map.
            // We will remove frozen layer and add delta layer in one atomic operation later.
-            let Some(layer) = self
-                .create_delta_layer(&frozen_layer, None, ctx)
-                .await
-                .map_err(|e| FlushLayerError::from_anyhow(self, e))?
-            else {
+            let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
                panic!("delta layer cannot be empty if no filter is applied");
            };
            (
@@ -3969,8 +3959,7 @@ impl Timeline {

            if self.set_disk_consistent_lsn(disk_consistent_lsn) {
                // Schedule remote uploads that will reflect our new disk_consistent_lsn
-                self.schedule_uploads(disk_consistent_lsn, layers_to_upload)
-                    .map_err(|e| FlushLayerError::from_anyhow(self, e))?;
+                self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
            }
            // release lock on 'layers'
        };
@@ -4268,7 +4257,7 @@ impl Timeline {
                                // Unfortunately we cannot do this for the main fork, or for
                                // any metadata keys, keys, as that would lead to actual data
                                // loss.
-                                if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() {
+                                if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
                                    warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
                                    ZERO_PAGE.clone()
                                } else {
@@ -4318,7 +4307,6 @@ impl Timeline {
        ctx: &RequestContext,
        img_range: Range<Key>,
        mode: ImageLayerCreationMode,
-        start: Key,
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
        assert!(!matches!(mode, ImageLayerCreationMode::Initial));

@@ -4327,43 +4315,39 @@ impl Timeline {
        let data = self
            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
            .await?;
-        let (data, total_kb_retrieved, total_keys_retrieved) = {
+        let (data, total_kb_retrieved, total_key_retrieved) = {
            let mut new_data = BTreeMap::new();
            let mut total_kb_retrieved = 0;
-            let mut total_keys_retrieved = 0;
+            let mut total_key_retrieved = 0;
            for (k, v) in data {
                let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
                total_kb_retrieved += KEY_SIZE + v.len();
-                total_keys_retrieved += 1;
+                total_key_retrieved += 1;
                new_data.insert(k, v);
            }
-            (new_data, total_kb_retrieved / 1024, total_keys_retrieved)
+            (new_data, total_kb_retrieved / 1024, total_key_retrieved)
        };
-        let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
+        let delta_file_accessed = reconstruct_state.get_delta_layers_visited();

-        let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
+        let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
        debug!(
-            trigger_generation,
-            delta_files_accessed,
-            total_kb_retrieved,
-            total_keys_retrieved,
-            "generate metadata images"
+            "generate image layers for metadata keys: trigger_generation={trigger_generation}, \
+                delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
+                total_key_retrieved={total_key_retrieved}"
        );
-
        if !trigger_generation && mode == ImageLayerCreationMode::Try {
            return Ok(ImageLayerCreationOutcome {
                image: None,
                next_start_key: img_range.end,
            });
        }
-        let mut wrote_any_image = false;
+        let has_keys = !data.is_empty();
        for (k, v) in data {
-            if v.is_empty() {
-                // the key has been deleted, it does not need an image
-                // in metadata keyspace, an empty image == tombstone
-                continue;
-            }
-            wrote_any_image = true;
+            // Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
+            // considers this situation properly.
+            // if v.is_empty() {
+            //     continue;
+            // }

            // No need to handle sharding b/c metadata keys are always on the 0-th shard.

@@ -4371,26 +4355,16 @@ impl Timeline {
            // on the normal data path either.
            image_layer_writer.put_image(k, v, ctx).await?;
        }
-
-        if wrote_any_image {
-            // Normal path: we have written some data into the new image layer for this
-            // partition, so flush it to disk.
-            let image_layer = image_layer_writer.finish(self, ctx).await?;
-            Ok(ImageLayerCreationOutcome {
-                image: Some(image_layer),
-                next_start_key: img_range.end,
-            })
-        } else {
-            // Special case: the image layer may be empty if this is a sharded tenant and the
-            // partition does not cover any keys owned by this shard. In this case, to ensure
-            // we don't leave gaps between image layers, leave `start` where it is, so that the next
-            // layer we write will cover the key range that we just scanned.
-            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
-            Ok(ImageLayerCreationOutcome {
-                image: None,
-                next_start_key: start,
-            })
-        }
+        Ok(ImageLayerCreationOutcome {
+            image: if has_keys {
+                let image_layer = image_layer_writer.finish(self, ctx).await?;
+                Some(image_layer)
+            } else {
+                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+                None
+            },
+            next_start_key: img_range.end,
+        })
    }

    #[tracing::instrument(skip_all, fields(%lsn, %mode))]
@@ -4500,7 +4474,6 @@ impl Timeline {
                        ctx,
                        img_range,
                        mode,
-                        start,
                    )
                    .await?;
                start = next_start_key;
@@ -4862,7 +4835,7 @@ impl Timeline {
    /// Currently, we don't make any attempt at removing unneeded page versions
    /// within a layer file. We can only remove the whole file if it's fully
    /// obsolete.
-    pub(super) async fn gc(&self) -> Result<GcResult, GcError> {
+    pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
        // this is most likely the background tasks, but it might be the spawned task from
        // immediate_gc
        let _g = tokio::select! {
@@ -4875,7 +4848,7 @@ impl Timeline {

        // Is the timeline being deleted?
        if self.is_stopping() {
-            return Err(GcError::TimelineCancelled);
+            anyhow::bail!("timeline is Stopping");
        }

        let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
@@ -4933,7 +4906,7 @@ impl Timeline {
        pitr_cutoff: Lsn,
        retain_lsns: Vec<Lsn>,
        new_gc_cutoff: Lsn,
-    ) -> Result<GcResult, GcError> {
+    ) -> anyhow::Result<GcResult> {
        // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc

        let now = SystemTime::now();
@@ -4955,15 +4928,12 @@ impl Timeline {
        // The GC cutoff should only ever move forwards.
        let waitlist = {
            let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
-            if *write_guard > new_gc_cutoff {
-                return Err(GcError::BadLsn {
-                    why: format!(
-                        "Cannot move GC cutoff LSN backwards (was {}, new {})",
-                        *write_guard, new_gc_cutoff
-                    ),
-                });
-            }
-
+            ensure!(
+                *write_guard <= new_gc_cutoff,
+                "Cannot move GC cutoff LSN backwards (was {}, new {})",
+                *write_guard,
+                new_gc_cutoff
+            );
            write_guard.store_and_unlock(new_gc_cutoff)
        };
        waitlist.wait().await;
@@ -5072,14 +5042,7 @@ impl Timeline {
            // This unconditionally schedules also an index_part.json update, even though, we will
            // be doing one a bit later with the unlinked gc'd layers.
            let disk_consistent_lsn = self.disk_consistent_lsn.load();
-            self.schedule_uploads(disk_consistent_lsn, None)
-                .map_err(|e| {
-                    if self.cancel.is_cancelled() {
-                        GcError::TimelineCancelled
-                    } else {
-                        GcError::Remote(e)
-                    }
-                })?;
+            self.schedule_uploads(disk_consistent_lsn, None)?;

            let gc_layers = layers_to_remove
                .iter()
@@ -5088,15 +5051,7 @@ impl Timeline {

            result.layers_removed = gc_layers.len() as u64;

-            self.remote_client
-                .schedule_gc_update(&gc_layers)
-                .map_err(|e| {
-                    if self.cancel.is_cancelled() {
-                        GcError::TimelineCancelled
-                    } else {
-                        GcError::Remote(e)
-                    }
-                })?;
+            self.remote_client.schedule_gc_update(&gc_layers)?;

            guard.finish_gc_timeline(&gc_layers);

@@ -5111,7 +5066,7 @@ impl Timeline {
            result.layers_removed, new_gc_cutoff
        );

-        result.elapsed = now.elapsed().unwrap_or(Duration::ZERO);
+        result.elapsed = now.elapsed()?;
        Ok(result)
    }

@@ -5403,133 +5358,6 @@ impl Timeline {
            shard_count: self.tenant_shard_id.shard_count,
        }
    }
-
-    #[cfg(test)]
-    pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
-        self.last_record_lsn.advance(new_lsn);
-    }
-
-    /// Force create an image layer and place it into the layer map.
-    ///
-    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
-    #[cfg(test)]
-    pub(super) async fn force_create_image_layer(
-        self: &Arc<Timeline>,
-        lsn: Lsn,
-        mut images: Vec<(Key, Bytes)>,
-        check_start_lsn: Option<Lsn>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let last_record_lsn = self.get_last_record_lsn();
-        assert!(
-            lsn <= last_record_lsn,
-            "advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}"
-        );
-        if let Some(check_start_lsn) = check_start_lsn {
-            assert!(lsn >= check_start_lsn);
-        }
-        images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb));
-        let min_key = *images.first().map(|(k, _)| k).unwrap();
-        let max_key = images.last().map(|(k, _)| k).unwrap().next();
-        let mut image_layer_writer = ImageLayerWriter::new(
-            self.conf,
-            self.timeline_id,
-            self.tenant_shard_id,
-            &(min_key..max_key),
-            lsn,
-            ctx,
-        )
-        .await?;
-        for (key, img) in images {
-            image_layer_writer.put_image(key, img, ctx).await?;
-        }
-        let image_layer = image_layer_writer.finish(self, ctx).await?;
-
-        {
-            let mut guard = self.layers.write().await;
-            guard.force_insert_layer(image_layer);
-        }
-
-        Ok(())
-    }
-
-    /// Force create a delta layer and place it into the layer map.
-    ///
-    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
-    #[cfg(test)]
-    pub(super) async fn force_create_delta_layer(
-        self: &Arc<Timeline>,
-        mut deltas: Vec<(Key, Lsn, Value)>,
-        check_start_lsn: Option<Lsn>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let last_record_lsn = self.get_last_record_lsn();
-        deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
-        let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
-        let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
-        let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
-        let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
-        assert!(
-            max_lsn <= last_record_lsn,
-            "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
-        );
-        let end_lsn = Lsn(max_lsn.0 + 1);
-        if let Some(check_start_lsn) = check_start_lsn {
-            assert!(min_lsn >= check_start_lsn);
-        }
-        let mut delta_layer_writer = DeltaLayerWriter::new(
-            self.conf,
-            self.timeline_id,
-            self.tenant_shard_id,
-            min_key,
-            min_lsn..end_lsn,
-            ctx,
-        )
-        .await?;
-        for (key, lsn, val) in deltas {
-            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
-        }
-        let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?;
-
-        {
-            let mut guard = self.layers.write().await;
-            guard.force_insert_layer(delta_layer);
-        }
-
-        Ok(())
-    }
-
-    /// Return all keys at the LSN in the image layers
-    #[cfg(test)]
-    pub(crate) async fn inspect_image_layers(
-        self: &Arc<Timeline>,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Bytes)>> {
-        let mut all_data = Vec::new();
-        let guard = self.layers.read().await;
-        for layer in guard.layer_map().iter_historic_layers() {
-            if !layer.is_delta() && layer.image_layer_lsn() == lsn {
-                let layer = guard.get_from_desc(&layer);
-                let mut reconstruct_data = ValuesReconstructState::default();
-                layer
-                    .get_values_reconstruct_data(
-                        KeySpace::single(Key::MIN..Key::MAX),
-                        lsn..Lsn(lsn.0 + 1),
-                        &mut reconstruct_data,
-                        ctx,
-                    )
-                    .await?;
-                for (k, v) in reconstruct_data.keys {
-                    all_data.push((k, v?.img.unwrap().1));
-                }
-            }
-        }
-        all_data.sort();
-        Ok(all_data)
-    }
 }

 type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -133,7 +133,8 @@ impl Timeline {
                        },
                        &image_ctx,
                    )
-                    .await?;
+                    .await
+                    .map_err(anyhow::Error::from)?;

                self.upload_new_image_layers(image_layers)?;
                partitioning.parts.len()
@@ -421,6 +422,48 @@ impl Timeline {
            return Ok(CompactLevel0Phase1Result::default());
        }

+        // This failpoint is used together with `test_duplicate_layers` integration test.
+        // It returns a compaction result containing exactly the same layers as the input to compaction.
+        // We want to ensure that this will not cause any problem when updating the layer map
+        // after the compaction is finished.
+        //
+        // Currently, there are two rare edge cases that will cause duplicated layers being
+        // inserted.
+        // 1. The compaction job is interrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
+        //    is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
+        //    map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compaction at this
+        //    point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
+        //    and this causes an overwrite. This is acceptable because the content is the same, and we should do a
+        //    layer replace instead of the normal remove / upload process.
+        // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file
+        //    size length. Compaction will likely create the same set of n files afterwards.
+        //
+        // This failpoint is a superset of both of the cases.
+        if cfg!(feature = "testing") {
+            let active = (|| {
+                ::fail::fail_point!("compact-level0-phase1-return-same", |_| true);
+                false
+            })();
+
+            if active {
+                let mut new_layers = Vec::with_capacity(level0_deltas.len());
+                for delta in &level0_deltas {
+                    // we are just faking these layers as being produced again for this failpoint
+                    new_layers.push(
+                        delta
+                            .download_and_keep_resident()
+                            .await
+                            .context("download layer for failpoint")?,
+                    );
+                }
+                tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
+                return Ok(CompactLevel0Phase1Result {
+                    new_layers,
+                    deltas_to_compact: level0_deltas,
+                });
+            }
+        }
+
        // Gather the files to compact in this iteration.
        //
        // Start with the oldest Level 0 delta file, and collect any other
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -1,6 +1,6 @@
 use std::sync::Arc;

-use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
+use super::{layer_manager::LayerManager, Timeline};
 use crate::{
    context::{DownloadBehavior, RequestContext},
    task_mgr::TaskKind,
@@ -23,7 +23,7 @@ pub(crate) enum Error {
    #[error("shutting down, please retry later")]
    ShuttingDown,
    #[error("flushing failed")]
-    FlushAncestor(#[source] FlushLayerError),
+    FlushAncestor(#[source] anyhow::Error),
    #[error("layer download failed")]
    RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
    #[error("copying LSN prefix locally failed")]
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -255,13 +255,6 @@ impl LayerManager {
        updates.flush()
    }

-    #[cfg(test)]
-    pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
-        let mut updates = self.layer_map.batch_update();
-        Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
-        updates.flush()
-    }
-
    /// Helper function to insert a layer into the layer map and file manager.
    fn insert_historic_layer(
        layer: Layer,
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -3,10 +3,12 @@ use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::remote_timeline_client::index::Lineage;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;

 use chrono::NaiveDateTime;
+use pageserver_api::models::AuxFilePolicy;
 use std::sync::Arc;
 use tracing::info;
 use utils::lsn::AtomicLsn;
@@ -43,25 +45,34 @@ pub(crate) struct UploadQueueInitialized {
    /// Counter to assign task IDs
    pub(crate) task_counter: u64,

-    /// The next uploaded index_part.json; assumed to be dirty.
-    ///
-    /// Should not be read, directly except for layer file updates. Instead you should add a
-    /// projected field.
-    pub(crate) dirty: IndexPart,
-
-    /// The latest remote persisted IndexPart.
-    ///
-    /// Each completed metadata upload will update this. The second item is the task_id which last
-    /// updated the value, used to ensure we never store an older value over a newer one.
-    pub(crate) clean: (IndexPart, Option<u64>),
+    /// All layer files stored in the remote storage, taking into account all
+    /// in-progress and queued operations
+    pub(crate) latest_files: HashMap<LayerName, LayerFileMetadata>,

    /// How many file uploads or deletions been scheduled, since the
    /// last (scheduling of) metadata index upload?
    pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,

-    /// The Lsn is only updated after our generation has been validated with
+    /// Metadata stored in the remote storage, taking into account all
+    /// in-progress and queued operations.
+    /// DANGER: do not return to outside world, e.g., safekeepers.
+    pub(crate) latest_metadata: TimelineMetadata,
+
+    /// Part of the flattened "next" `index_part.json`.
+    pub(crate) latest_lineage: Lineage,
+
+    /// The last aux file policy used on this timeline.
+    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
+
+    /// `disk_consistent_lsn` from the last metadata file that was successfully
+    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
+    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
+    /// Safekeeper can rely on it to make decisions for WAL storage.
+    ///
+    /// visible_remote_consistent_lsn is only updated after our generation has been validated with
    /// the control plane (unlesss a timeline's generation is None, in which case
    /// we skip validation)
+    pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
    pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,

    // Breakdown of different kinds of tasks currently in-progress
@@ -107,8 +118,7 @@ impl UploadQueueInitialized {
    }

    pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
-        let lsn = self.clean.0.metadata.disk_consistent_lsn();
-        self.clean.1.map(|_| lsn)
+        self.projected_remote_consistent_lsn
    }
 }

@@ -164,12 +174,13 @@ impl UploadQueue {

        info!("initializing upload queue for empty remote");

-        let index_part = IndexPart::empty(metadata.clone());
-
        let state = UploadQueueInitialized {
-            dirty: index_part.clone(),
-            clean: (index_part, None),
+            // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
+            latest_files: HashMap::new(),
            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: metadata.clone(),
+            latest_lineage: Lineage::default(),
+            projected_remote_consistent_lsn: None,
            visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
            // what follows are boring default initializations
            task_counter: 0,
@@ -182,6 +193,7 @@ impl UploadQueue {
            dangling_files: HashMap::new(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+            last_aux_file_policy: Default::default(),
        };

        *self = UploadQueue::Initialized(state);
@@ -199,15 +211,22 @@ impl UploadQueue {
            }
        }

+        let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
+        for (layer_name, layer_metadata) in &index_part.layer_metadata {
+            files.insert(layer_name.to_owned(), layer_metadata.clone());
+        }
+
        info!(
            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
            index_part.metadata.disk_consistent_lsn()
        );

        let state = UploadQueueInitialized {
-            dirty: index_part.clone(),
-            clean: (index_part.clone(), None),
+            latest_files: files,
            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: index_part.metadata.clone(),
+            latest_lineage: index_part.lineage.clone(),
+            projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
            visible_remote_consistent_lsn: Arc::new(
                index_part.metadata.disk_consistent_lsn().into(),
            ),
@@ -222,6 +241,7 @@ impl UploadQueue {
            dangling_files: HashMap::new(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+            last_aux_file_policy: index_part.last_aux_file_policy(),
        };

        *self = UploadQueue::Initialized(state);
@@ -278,16 +298,13 @@ pub(crate) enum UploadOp {
    /// Upload a layer file
    UploadLayer(ResidentLayer, LayerFileMetadata),

-    /// Upload a index_part.json file
-    UploadMetadata {
-        /// The next [`UploadQueueInitialized::clean`] after this upload succeeds.
-        uploaded: Box<IndexPart>,
-    },
+    /// Upload the metadata file
+    UploadMetadata(Box<IndexPart>, Lsn),

    /// Delete layer files
    Delete(Delete),

-    /// Barrier. When the barrier operation is reached, the channel is closed.
+    /// Barrier. When the barrier operation is reached, the channel is closed.
    Barrier(tokio::sync::watch::Sender<()>),

    /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise
@@ -305,12 +322,8 @@ impl std::fmt::Display for UploadOp {
                    layer, metadata.file_size, metadata.generation
                )
            }
-            UploadOp::UploadMetadata { uploaded, .. } => {
-                write!(
-                    f,
-                    "UploadMetadata(lsn: {})",
-                    uploaded.metadata.disk_consistent_lsn()
-                )
+            UploadOp::UploadMetadata(_, lsn) => {
+                write!(f, "UploadMetadata(lsn: {})", lsn)
            }
            UploadOp::Delete(delete) => {
                write!(f, "Delete({} layers)", delete.layers.len())
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -234,7 +234,6 @@ impl WalIngest {
                        modification,
                        &parsed_xact,
                        info == pg_constants::XLOG_XACT_COMMIT,
-                        decoded.origin_id,
                        ctx,
                    )
                    .await?;
@@ -247,7 +246,6 @@ impl WalIngest {
                        modification,
                        &parsed_xact,
                        info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
-                        decoded.origin_id,
                        ctx,
                    )
                    .await?;
@@ -377,18 +375,6 @@ impl WalIngest {
                    self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
                }
            }
-            pg_constants::RM_REPLORIGIN_ID => {
-                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-                if info == pg_constants::XLOG_REPLORIGIN_SET {
-                    let xlrec = crate::walrecord::XlReploriginSet::decode(&mut buf);
-                    modification
-                        .set_replorigin(xlrec.node_id, xlrec.remote_lsn)
-                        .await?
-                } else if info == pg_constants::XLOG_REPLORIGIN_DROP {
-                    let xlrec = crate::walrecord::XlReploriginDrop::decode(&mut buf);
-                    modification.drop_replorigin(xlrec.node_id).await?
-                }
-            }
            _x => {
                // TODO: should probably log & fail here instead of blindly
                // doing something without understanding the protocol
@@ -1192,7 +1178,6 @@ impl WalIngest {
        modification: &mut DatadirModification<'_>,
        parsed: &XlXactParsedRecord,
        is_commit: bool,
-        origin_id: u16,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Record update of CLOG pages
@@ -1258,11 +1243,6 @@ impl WalIngest {
                }
            }
        }
-        if origin_id != 0 {
-            modification
-                .set_replorigin(origin_id, parsed.origin_lsn)
-                .await?;
-        }
        Ok(())
    }

--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -9,10 +9,10 @@ use postgres_ffi::pg_constants;
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{BlockNumber, TimestampTz};
 use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
-use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
+use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
 use serde::{Deserialize, Serialize};
 use tracing::*;
-use utils::{bin_ser::DeserializeError, lsn::Lsn};
+use utils::bin_ser::DeserializeError;

 /// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
 /// around a PostgreSQL WAL record, or a custom neon-specific "record".
@@ -116,7 +116,6 @@ pub struct DecodedWALRecord {

    pub blocks: Vec<DecodedBkpBlock>,
    pub main_data_offset: usize,
-    pub origin_id: u16,
 }

 #[repr(C)]
@@ -574,7 +573,6 @@ pub struct XlXactParsedRecord {
    pub subxacts: Vec<TransactionId>,

    pub xnodes: Vec<RelFileNode>,
-    pub origin_lsn: Lsn,
 }

 impl XlXactParsedRecord {
@@ -653,11 +651,6 @@ impl XlXactParsedRecord {
            debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid);
        }

-        let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 {
-            Lsn(buf.get_u64_le())
-        } else {
-            Lsn::INVALID
-        };
        XlXactParsedRecord {
            xid,
            info,
@@ -667,7 +660,6 @@ impl XlXactParsedRecord {
            ts_id,
            subxacts,
            xnodes,
-            origin_lsn,
        }
    }
 }
@@ -818,36 +810,6 @@ impl XlRunningXacts {
    }
 }

-#[repr(C)]
-#[derive(Debug)]
-pub struct XlReploriginDrop {
-    pub node_id: RepOriginId,
-}
-
-impl XlReploriginDrop {
-    pub fn decode(buf: &mut Bytes) -> XlReploriginDrop {
-        XlReploriginDrop {
-            node_id: buf.get_u16_le(),
-        }
-    }
-}
-
-#[repr(C)]
-#[derive(Debug)]
-pub struct XlReploriginSet {
-    pub remote_lsn: Lsn,
-    pub node_id: RepOriginId,
-}
-
-impl XlReploriginSet {
-    pub fn decode(buf: &mut Bytes) -> XlReploriginSet {
-        XlReploriginSet {
-            remote_lsn: Lsn(buf.get_u64_le()),
-            node_id: buf.get_u16_le(),
-        }
-    }
-}
-
 /// Main routine to decode a WAL record and figure out which blocks are modified
 //
 // See xlogrecord.h for details
@@ -882,7 +844,6 @@ pub fn decode_wal_record(
    let mut rnode_dbnode: u32 = 0;
    let mut rnode_relnode: u32 = 0;
    let mut got_rnode = false;
-    let mut origin_id: u16 = 0;

    let mut buf = record.clone();

@@ -930,7 +891,7 @@ pub fn decode_wal_record(

            pg_constants::XLR_BLOCK_ID_ORIGIN => {
                // RepOriginId is uint16
-                origin_id = buf.get_u16_le();
+                buf.advance(2);
            }

            pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => {
@@ -1127,7 +1088,6 @@ pub fn decode_wal_record(
    decoded.xl_info = xlogrec.xl_info;
    decoded.xl_rmid = xlogrec.xl_rmid;
    decoded.record = record;
-    decoded.origin_id = origin_id;
    decoded.main_data_offset = main_data_offset;

    Ok(())
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -20,6 +20,7 @@

 /// Process lifecycle and abstracction for the IPC protocol.
 mod process;
+pub use process::Kind as ProcessKind;

 /// Code to apply [`NeonWalRecord`]s.
 pub(crate) mod apply_neon;
@@ -33,6 +34,7 @@ use crate::repository::Key;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Bytes, BytesMut};
+use pageserver_api::key::key_to_rel_block;
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
 use std::sync::Arc;
@@ -53,7 +55,7 @@ pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
-    /// The current [`process::WalRedoProcess`] that is used by new redo requests.
+    /// The current [`process::Process`] that is used by new redo requests.
    /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
    /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
    /// their process object; we use [`Arc::clone`] for that.
@@ -65,7 +67,7 @@ pub struct PostgresRedoManager {
    /// still be using the old redo process. But, those other tasks will most likely
    /// encounter an error as well, and errors are an unexpected condition anyway.
    /// So, probably we could get rid of the `Arc` in the future.
-    redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
+    redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>,
 }

 ///
@@ -206,35 +208,30 @@ impl PostgresRedoManager {
    ) -> anyhow::Result<Bytes> {
        *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());

-        let (rel, blknum) = key.to_rel_block().context("invalid record")?;
+        let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let mut n_attempts = 0u32;
        loop {
-            let proc: Arc<process::WalRedoProcess> =
-                match self.redo_process.get_or_init_detached().await {
-                    Ok(guard) => Arc::clone(&guard),
-                    Err(permit) => {
-                        // don't hold poison_guard, the launch code can bail
-                        let start = Instant::now();
-                        let proc = Arc::new(
-                            process::WalRedoProcess::launch(
-                                self.conf,
-                                self.tenant_shard_id,
-                                pg_version,
-                            )
+            let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await {
+                Ok(guard) => Arc::clone(&guard),
+                Err(permit) => {
+                    // don't hold poison_guard, the launch code can bail
+                    let start = Instant::now();
+                    let proc = Arc::new(
+                        process::Process::launch(self.conf, self.tenant_shard_id, pg_version)
                            .context("launch walredo process")?,
-                        );
-                        let duration = start.elapsed();
-                        WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                        info!(
-                            duration_ms = duration.as_millis(),
-                            pid = proc.id(),
-                            "launched walredo process"
-                        );
-                        self.redo_process.set(Arc::clone(&proc), permit);
-                        proc
-                    }
-                };
+                    );
+                    let duration = start.elapsed();
+                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                    info!(
+                        duration_ms = duration.as_millis(),
+                        pid = proc.id(),
+                        "launched walredo process"
+                    );
+                    self.redo_process.set(Arc::clone(&proc), permit);
+                    proc
+                }
+            };

            let started_at = std::time::Instant::now();

@@ -365,10 +362,10 @@ impl PostgresRedoManager {
        &self,
        key: Key,
        page: &mut BytesMut,
-        record_lsn: Lsn,
+        _record_lsn: Lsn,
        record: &NeonWalRecord,
    ) -> anyhow::Result<()> {
-        apply_neon::apply_in_neon(record, record_lsn, key, page)?;
+        apply_neon::apply_in_neon(record, key, page)?;

        Ok(())
    }
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -3,7 +3,7 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, BytesMut};
-use pageserver_api::key::Key;
+use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
@@ -14,7 +14,6 @@ use postgres_ffi::v14::nonrelfile_utils::{
 use postgres_ffi::BLCKSZ;
 use tracing::*;
 use utils::bin_ser::BeSer;
-use utils::lsn::Lsn;

 /// Can this request be served by neon redo functions
 /// or we need to pass it to wal-redo postgres process?
@@ -33,7 +32,6 @@ pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {

 pub(crate) fn apply_in_neon(
    record: &NeonWalRecord,
-    lsn: Lsn,
    key: Key,
    page: &mut BytesMut,
 ) -> Result<(), anyhow::Error> {
@@ -50,7 +48,7 @@ pub(crate) fn apply_in_neon(
            flags,
        } => {
            // sanity check that this is modifying the correct relation
-            let (rel, blknum) = key.to_rel_block().context("invalid record")?;
+            let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
            assert!(
                rel.forknum == VISIBILITYMAP_FORKNUM,
                "ClearVisibilityMapFlags record on unexpected rel {}",
@@ -69,7 +67,6 @@ pub(crate) fn apply_in_neon(
                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                map[map_byte as usize] &= !(flags << map_offset);
-                postgres_ffi::page_set_lsn(page, lsn);
            }

            // Repeat for 'old_heap_blkno', if any
@@ -83,13 +80,12 @@ pub(crate) fn apply_in_neon(
                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                map[map_byte as usize] &= !(flags << map_offset);
-                postgres_ffi::page_set_lsn(page, lsn);
            }
        }
        // Non-relational WAL records are handled here, with custom code that has the
        // same effects as the corresponding Postgres WAL redo function.
        NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
-            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
+            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::Clog,
@@ -134,7 +130,7 @@ pub(crate) fn apply_in_neon(
            }
        }
        NeonWalRecord::ClogSetAborted { xids } => {
-            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
+            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::Clog,
@@ -164,7 +160,7 @@ pub(crate) fn apply_in_neon(
            }
        }
        NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
-            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
+            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::MultiXactOffsets,
@@ -196,7 +192,7 @@ pub(crate) fn apply_in_neon(
            LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
        }
        NeonWalRecord::MultixactMembersCreate { moff, members } => {
-            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
+            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::MultiXactMembers,
@@ -289,7 +285,7 @@ mod test {
        let mut page = BytesMut::from_iter(base_image);

        for record in deltas {
-            apply_in_neon(&record, Lsn(8), file_path, &mut page)?;
+            apply_in_neon(&record, file_path, &mut page)?;
        }

        let reconstructed = AuxFilesDirectory::des(&page)?;
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -1,184 +1,64 @@
+//! Layer of indirection previously used to support multiple implementations.
+//! Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
+use std::time::Duration;
+
+use bytes::Bytes;
+use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use tracing::warn;
+use utils::lsn::Lsn;
+
+use crate::{config::PageServerConf, walrecord::NeonWalRecord};
+
 mod no_leak_child;
 /// The IPC protocol that pageserver and walredo process speak over their shared pipe.
 mod protocol;

-use self::no_leak_child::NoLeakChild;
-use crate::{
-    config::PageServerConf,
-    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
-    walrecord::NeonWalRecord,
-};
-use anyhow::Context;
-use bytes::Bytes;
-use pageserver_api::{reltag::RelTag, shard::TenantShardId};
-use postgres_ffi::BLCKSZ;
-#[cfg(feature = "testing")]
-use std::sync::atomic::AtomicUsize;
-use std::{
-    collections::VecDeque,
-    process::{Command, Stdio},
-    time::Duration,
-};
-use tokio::io::{AsyncReadExt, AsyncWriteExt};
-use tracing::{debug, error, instrument, Instrument};
-use utils::{lsn::Lsn, poison::Poison};
-
-pub struct WalRedoProcess {
-    #[allow(dead_code)]
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    // Some() on construction, only becomes None on Drop.
-    child: Option<NoLeakChild>,
-    stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
-    stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
-    /// Counter to separate same sized walredo inputs failing at the same millisecond.
-    #[cfg(feature = "testing")]
-    dump_sequence: AtomicUsize,
+mod process_impl {
+    pub(super) mod process_async;
 }

-struct ProcessInput {
-    stdin: tokio::process::ChildStdin,
-    n_requests: usize,
+#[derive(
+    Clone,
+    Copy,
+    Debug,
+    PartialEq,
+    Eq,
+    strum_macros::EnumString, // parsing from the string form
+    strum_macros::Display, // printing the string form (used by the `%` field in `Process::launch`'s warning)
+    strum_macros::IntoStaticStr,
+    serde_with::DeserializeFromStr, // serde goes through the string form rather than the variant index
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")] // string forms are "sync" and "async"
+#[repr(u8)]
+pub enum Kind { // value type of the `walredo_process_kind` config setting
+    Sync, // still accepted, but `Process::launch` only logs a warning and uses the async implementation
+    Async, // the only implementation that `Process` wraps
+}

-struct ProcessOutput {
-    stdout: tokio::process::ChildStdout,
-    pending_responses: VecDeque<Option<Bytes>>,
-    n_processed_responses: usize,
-}
+pub(crate) struct Process(process_impl::process_async::WalRedoProcess);

-impl WalRedoProcess {
-    //
-    // Start postgres binary in special WAL redo mode.
-    //
-    #[instrument(skip_all,fields(pg_version=pg_version))]
-    pub(crate) fn launch(
+impl Process {
+    #[inline(always)]
+    pub fn launch(
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
        pg_version: u32,
    ) -> anyhow::Result<Self> {
-        crate::span::debug_assert_current_span_has_tenant_id();
-
-        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
-        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
-
-        use no_leak_child::NoLeakChildCommandExt;
-        // Start postgres itself
-        let child = Command::new(pg_bin_dir_path.join("postgres"))
-            // the first arg must be --wal-redo so the child process enters into walredo mode
-            .arg("--wal-redo")
-            // the child doesn't process this arg, but, having it in the argv helps indentify the
-            // walredo process for a particular tenant when debugging a pagserver
-            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
-            .stdin(Stdio::piped())
-            .stderr(Stdio::piped())
-            .stdout(Stdio::piped())
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            // NB: The redo process is not trusted after we sent it the first
-            // walredo work. Before that, it is trusted. Specifically, we trust
-            // it to
-            // 1. close all file descriptors except stdin, stdout, stderr because
-            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
-            //    the files it opens, and
-            // 2. to use seccomp to sandbox itself before processing the first
-            //    walredo request.
-            .spawn_no_leak_child(tenant_shard_id)
-            .context("spawn process")?;
-        WAL_REDO_PROCESS_COUNTERS.started.inc();
-        let mut child = scopeguard::guard(child, |child| {
-            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait(WalRedoKillCause::Startup);
-        });
-
-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
-        let stdin =
-            tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
-        let stdout = tokio::process::ChildStdout::from_std(stdout)
-            .context("convert to tokio::ChildStdout")?;
-
-        // all fallible operations post-spawn are complete, so get rid of the guard
-        let child = scopeguard::ScopeGuard::into_inner(child);
-
-        tokio::spawn(
-            async move {
-                scopeguard::defer! {
-                    debug!("wal-redo-postgres stderr_logger_task finished");
-                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
-                }
-                debug!("wal-redo-postgres stderr_logger_task started");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-                use tokio::io::AsyncBufReadExt;
-                let mut stderr_lines = tokio::io::BufReader::new(stderr);
-                let mut buf = Vec::new();
-                let res = loop {
-                    buf.clear();
-                    // TODO we don't trust the process to cap its stderr length.
-                    // Currently it can do unbounded Vec allocation.
-                    match stderr_lines.read_until(b'\n', &mut buf).await {
-                        Ok(0) => break Ok(()), // eof
-                        Ok(num_bytes) => {
-                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
-                        }
-                        Err(e) => {
-                            break Err(e);
-                        }
-                    }
-                };
-                match res {
-                    Ok(()) => (),
-                    Err(e) => {
-                        error!(error=?e, "failed to read from walredo stderr");
-                    }
-                }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
-        );
-
-        Ok(Self {
+        if conf.walredo_process_kind != Kind::Async {
+            warn!(
+                configured = %conf.walredo_process_kind,
+                "the walredo_process_kind setting has been turned into a no-op, using async implementation"
+            );
+        }
+        Ok(Self(process_impl::process_async::WalRedoProcess::launch(
            conf,
            tenant_shard_id,
-            child: Some(child),
-            stdin: tokio::sync::Mutex::new(Poison::new(
-                "stdin",
-                ProcessInput {
-                    stdin,
-                    n_requests: 0,
-                },
-            )),
-            stdout: tokio::sync::Mutex::new(Poison::new(
-                "stdout",
-                ProcessOutput {
-                    stdout,
-                    pending_responses: VecDeque::new(),
-                    n_processed_responses: 0,
-                },
-            )),
-            #[cfg(feature = "testing")]
-            dump_sequence: AtomicUsize::default(),
-        })
+            pg_version,
+        )?))
    }

-    pub(crate) fn id(&self) -> u32 {
-        self.child
-            .as_ref()
-            .expect("must not call this during Drop")
-            .id()
-    }
-
-    /// Apply given WAL records ('records') over an old page image. Returns
-    /// new page image.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// Cancellation safe.
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    #[inline(always)]
    pub(crate) async fn apply_wal_records(
        &self,
        rel: RelTag,
@@ -187,191 +67,12 @@ impl WalRedoProcess {
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
    ) -> anyhow::Result<Bytes> {
-        let tag = protocol::BufferTag { rel, blknum };
-
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        //
-        // Most requests start with a before-image with BLCKSZ bytes, followed by
-        // by some other WAL records. Start with a buffer that can hold that
-        // comfortably.
-        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
-        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            protocol::build_push_page_msg(tag, img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            if let NeonWalRecord::Postgres {
-                will_init: _,
-                rec: postgres_rec,
-            } = rec
-            {
-                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
-            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
-            }
-        }
-        protocol::build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        let Ok(res) =
-            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
-        else {
-            anyhow::bail!("WAL redo timed out");
-        };
-
-        if res.is_err() {
-            // not all of these can be caused by this particular input, however these are so rare
-            // in tests so capture all.
-            self.record_and_log(&writebuf);
-        }
-
-        res
+        self.0
+            .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+            .await
    }

-    /// # Cancel-Safety
-    ///
-    /// When not polled to completion (e.g. because in `tokio::select!` another
-    /// branch becomes ready before this future), concurrent and subsequent
-    /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
-    /// Dispose of this process instance and create a new one.
-    async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
-        let request_no = {
-            let mut lock_guard = self.stdin.lock().await;
-            let mut poison_guard = lock_guard.check_and_arm()?;
-            let input = poison_guard.data_mut();
-            input
-                .stdin
-                .write_all(writebuf)
-                .await
-                .context("write to walredo stdin")?;
-            let request_no = input.n_requests;
-            input.n_requests += 1;
-            poison_guard.disarm();
-            request_no
-        };
-
-        // To improve walredo performance we separate sending requests and receiving
-        // responses. Them are protected by different mutexes (output and input).
-        // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
-        // then there is not warranty that T1 will first granted output mutex lock.
-        // To address this issue we maintain number of sent requests, number of processed
-        // responses and ring buffer with pending responses. After sending response
-        // (under input mutex), threads remembers request number. Then it releases
-        // input mutex, locks output mutex and fetch in ring buffer all responses until
-        // its stored request number. The it takes correspondent element from
-        // pending responses ring buffer and truncate all empty elements from the front,
-        // advancing processed responses number.
-
-        let mut lock_guard = self.stdout.lock().await;
-        let mut poison_guard = lock_guard.check_and_arm()?;
-        let output = poison_guard.data_mut();
-        let n_processed_responses = output.n_processed_responses;
-        while n_processed_responses + output.pending_responses.len() <= request_no {
-            // We expect the WAL redo process to respond with an 8k page image. We read it
-            // into this buffer.
-            let mut resultbuf = vec![0; BLCKSZ.into()];
-            output
-                .stdout
-                .read_exact(&mut resultbuf)
-                .await
-                .context("read walredo stdout")?;
-            output
-                .pending_responses
-                .push_back(Some(Bytes::from(resultbuf)));
-        }
-        // Replace our request's response with None in `pending_responses`.
-        // Then make space in the ring buffer by clearing out any seqence of contiguous
-        // `None`'s from the front of `pending_responses`.
-        // NB: We can't pop_front() because other requests' responses because another
-        // requester might have grabbed the output mutex before us:
-        // T1: grab input mutex
-        // T1: send request_no 23
-        // T1: release input mutex
-        // T2: grab input mutex
-        // T2: send request_no 24
-        // T2: release input mutex
-        // T2: grab output mutex
-        // T2: n_processed_responses + output.pending_responses.len() <= request_no
-        //            23                                0                   24
-        // T2: enters poll loop that reads stdout
-        // T2: put response for 23 into pending_responses
-        // T2: put response for 24 into pending_resposnes
-        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
-        // T2: takes its response_24
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: releases output mutex
-        // T1: grabs output mutex
-        // T1: n_processed_responses + output.pending_responses.len() > request_no
-        //            23                                2                   23
-        // T1: skips poll loop that reads stdout
-        // T1: takes its response_23
-        // pending_responses now looks like this: Front None None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Back
-        // n_processed_responses now has value 25
-        let res = output.pending_responses[request_no - n_processed_responses]
-            .take()
-            .expect("we own this request_no, nobody else is supposed to take it");
-        while let Some(front) = output.pending_responses.front() {
-            if front.is_none() {
-                output.pending_responses.pop_front();
-                output.n_processed_responses += 1;
-            } else {
-                break;
-            }
-        }
-        poison_guard.disarm();
-        Ok(res)
-    }
-
-    #[cfg(feature = "testing")]
-    fn record_and_log(&self, writebuf: &[u8]) {
-        use std::sync::atomic::Ordering;
-
-        let millis = std::time::SystemTime::now()
-            .duration_since(std::time::SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_millis();
-
-        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
-
-        // these files will be collected to an allure report
-        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
-
-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
-
-        use std::io::Write;
-        let res = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .read(true)
-            .open(path)
-            .and_then(|mut f| f.write_all(writebuf));
-
-        // trip up allowed_errors
-        if let Err(e) = res {
-            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
-        } else {
-            tracing::error!(filename, "erroring walredo input saved");
-        }
-    }
-
-    #[cfg(not(feature = "testing"))]
-    fn record_and_log(&self, _: &[u8]) {}
-}
-
-impl Drop for WalRedoProcess {
-    fn drop(&mut self) {
-        self.child
-            .take()
-            .expect("we only do this once")
-            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        // no way to wait for stderr_logger_task from Drop because that is async only
+    pub(crate) fn id(&self) -> u32 {
+        self.0.id()
    }
 }
--- a/pageserver/src/walredo/process/process_impl/process_async.rs
+++ b/pageserver/src/walredo/process/process_impl/process_async.rs
@@ -0,0 +1,374 @@
+use self::no_leak_child::NoLeakChild;
+use crate::{
+    config::PageServerConf,
+    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
+    walrecord::NeonWalRecord,
+    walredo::process::{no_leak_child, protocol},
+};
+use anyhow::Context;
+use bytes::Bytes;
+use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use postgres_ffi::BLCKSZ;
+#[cfg(feature = "testing")]
+use std::sync::atomic::AtomicUsize;
+use std::{
+    collections::VecDeque,
+    process::{Command, Stdio},
+    time::Duration,
+};
+use tokio::io::{AsyncReadExt, AsyncWriteExt};
+use tracing::{debug, error, instrument, Instrument};
+use utils::{lsn::Lsn, poison::Poison};
+
+pub struct WalRedoProcess {
+    #[allow(dead_code)]
+    conf: &'static PageServerConf,
+    tenant_shard_id: TenantShardId,
+    // Some() on construction, only becomes None on Drop.
+    child: Option<NoLeakChild>,
+    stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
+    stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
+    /// Counter to separate same sized walredo inputs failing at the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,
+}
+
+struct ProcessInput {
+    stdin: tokio::process::ChildStdin,
+    n_requests: usize,
+}
+
+struct ProcessOutput {
+    stdout: tokio::process::ChildStdout,
+    pending_responses: VecDeque<Option<Bytes>>,
+    n_processed_responses: usize,
+}
+
+impl WalRedoProcess {
+    //
+    // Start postgres binary in special WAL redo mode.
+    //
+    #[instrument(skip_all,fields(pg_version=pg_version))]
+    pub(crate) fn launch(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        pg_version: u32,
+    ) -> anyhow::Result<Self> {
+        crate::span::debug_assert_current_span_has_tenant_id();
+
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
+
+        use no_leak_child::NoLeakChildCommandExt;
+        // Start postgres itself
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters into walredo mode
+            .arg("--wal-redo")
+            // the child doesn't process this arg, but having it in the argv helps identify the
+            // walredo process for a particular tenant when debugging a pageserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
+            .stdin(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            // NB: The redo process is not trusted after we sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
+            .spawn_no_leak_child(tenant_shard_id)
+            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait(WalRedoKillCause::Startup);
+        });
+
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
+        let stdin =
+            tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
+        let stdout = tokio::process::ChildStdout::from_std(stdout)
+            .context("convert to tokio::ChildStdout")?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);
+
+        tokio::spawn(
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
+                        }
+                        Err(e) => {
+                            break Err(e);
+                        }
+                    }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+        );
+
+        Ok(Self {
+            conf,
+            tenant_shard_id,
+            child: Some(child),
+            stdin: tokio::sync::Mutex::new(Poison::new(
+                "stdin",
+                ProcessInput {
+                    stdin,
+                    n_requests: 0,
+                },
+            )),
+            stdout: tokio::sync::Mutex::new(Poison::new(
+                "stdout",
+                ProcessOutput {
+                    stdout,
+                    pending_responses: VecDeque::new(),
+                    n_processed_responses: 0,
+                },
+            )),
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
+        })
+    }
+
+    pub(crate) fn id(&self) -> u32 {
+        self.child
+            .as_ref()
+            .expect("must not call this during Drop")
+            .id()
+    }
+
+    /// Apply given WAL records ('records') over an old page image. Returns
+    /// new page image.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// Cancellation safe.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    pub(crate) async fn apply_wal_records(
+        &self,
+        rel: RelTag,
+        blknum: u32,
+        base_img: &Option<Bytes>,
+        records: &[(Lsn, NeonWalRecord)],
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let tag = protocol::BufferTag { rel, blknum };
+
+        // Serialize all the messages to send the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed by
+        // some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
+        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            protocol::build_push_page_msg(tag, img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            if let NeonWalRecord::Postgres {
+                will_init: _,
+                rec: postgres_rec,
+            } = rec
+            {
+                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
+            } else {
+                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+            }
+        }
+        protocol::build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        let Ok(res) =
+            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
+        else {
+            anyhow::bail!("WAL redo timed out");
+        };
+
+        if res.is_err() {
+            // not all of these can be caused by this particular input, however these are so rare
+            // in tests so capture all.
+            self.record_and_log(&writebuf);
+        }
+
+        res
+    }
+
+    /// # Cancel-Safety
+    ///
+    /// When not polled to completion (e.g. because in `tokio::select!` another
+    /// branch becomes ready before this future), concurrent and subsequent
+    /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
+    /// Dispose of this process instance and create a new one.
+    async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
+        let request_no = {
+            let mut lock_guard = self.stdin.lock().await;
+            let mut poison_guard = lock_guard.check_and_arm()?;
+            let input = poison_guard.data_mut();
+            input
+                .stdin
+                .write_all(writebuf)
+                .await
+                .context("write to walredo stdin")?;
+            let request_no = input.n_requests;
+            input.n_requests += 1;
+            poison_guard.disarm();
+            request_no
+        };
+
+        // To improve walredo performance we separate sending requests from receiving
+        // responses. They are protected by different mutexes (input and output).
+        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
+        // there is no guarantee that T1 will be granted the output mutex first.
+        // To address this we maintain the number of sent requests, the number of processed
+        // responses, and a ring buffer of pending responses. After sending its request
+        // (under the input mutex), a thread remembers its request number. It then releases
+        // the input mutex, locks the output mutex, and reads responses into the ring buffer
+        // up to and including its own request number. Then it takes the corresponding element
+        // from the pending-responses ring buffer and removes all empty elements from the front,
+        // advancing the processed-responses count.
+
+        let mut lock_guard = self.stdout.lock().await;
+        let mut poison_guard = lock_guard.check_and_arm()?;
+        let output = poison_guard.data_mut();
+        let n_processed_responses = output.n_processed_responses;
+        while n_processed_responses + output.pending_responses.len() <= request_no {
+            // We expect the WAL redo process to respond with an 8k page image. We read it
+            // into this buffer.
+            let mut resultbuf = vec![0; BLCKSZ.into()];
+            output
+                .stdout
+                .read_exact(&mut resultbuf)
+                .await
+                .context("read walredo stdout")?;
+            output
+                .pending_responses
+                .push_back(Some(Bytes::from(resultbuf)));
+        }
+        // Replace our request's response with None in `pending_responses`.
+        // Then make space in the ring buffer by clearing out any sequence of contiguous
+        // `None`'s from the front of `pending_responses`.
+        // NB: We can't just pop_front() because the front may hold another request's
+        // response; another requester might have grabbed the output mutex before us:
+        // T1: grab input mutex
+        // T1: send request_no 23
+        // T1: release input mutex
+        // T2: grab input mutex
+        // T2: send request_no 24
+        // T2: release input mutex
+        // T2: grab output mutex
+        // T2: n_processed_responses + output.pending_responses.len() <= request_no
+        //            23                                0                   24
+        // T2: enters poll loop that reads stdout
+        // T2: put response for 23 into pending_responses
+        // T2: put response for 24 into pending_responses
+        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
+        // T2: takes its response_24
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: does the while loop below
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: releases output mutex
+        // T1: grabs output mutex
+        // T1: n_processed_responses + output.pending_responses.len() > request_no
+        //            23                                2                   23
+        // T1: skips poll loop that reads stdout
+        // T1: takes its response_23
+        // pending_responses now looks like this: Front None None Back
+        // T1: does the while loop below
+        // pending_responses now looks like this: Front Back
+        // n_processed_responses now has value 25
+        let res = output.pending_responses[request_no - n_processed_responses]
+            .take()
+            .expect("we own this request_no, nobody else is supposed to take it");
+        while let Some(front) = output.pending_responses.front() {
+            if front.is_none() {
+                output.pending_responses.pop_front();
+                output.n_processed_responses += 1;
+            } else {
+                break;
+            }
+        }
+        poison_guard.disarm();
+        Ok(res)
+    }
+
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) {
+        use std::sync::atomic::Ordering;
+
+        let millis = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+
+        // these files will be collected to an allure report
+        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
+
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+
+        use std::io::Write;
+        let res = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .read(true)
+            .open(path)
+            .and_then(|mut f| f.write_all(writebuf));
+
+        // trip up allowed_errors
+        if let Err(e) = res {
+            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
+        } else {
+            tracing::error!(filename, "erroring walredo input saved");
+        }
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _: &[u8]) {}
+}
+
+impl Drop for WalRedoProcess {
+    fn drop(&mut self) {
+        self.child
+            .take()
+            .expect("we only do this once")
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        // no way to wait for stderr_logger_task from Drop because that is async only
+    }
+}
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -184,8 +184,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti
 	}
 	else if (state->wre_errno == ENOENT)
 	{
-		nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote",
-				LSN_FORMAT_ARGS(startptr), count);
+		nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
+				LSN_FORMAT_ARGS(startptr));
 		return NeonWALReadRemote(state, buf, startptr, count, tli);
 	}
 	else
@@ -614,7 +614,6 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
 		uint32		startoff;
 		int			segbytes;
 		int			readbytes;
-		XLogSegNo	lastRemovedSegNo;

 		startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);

@@ -690,23 +689,6 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
 			return false;
 		}

-		/*
-		 * Recheck that the segment hasn't been removed while we were reading
-		 * it.
-		 */
-		lastRemovedSegNo = XLogGetLastRemovedSegno();
-		if (state->seg.ws_segno <= lastRemovedSegNo)
-		{
-			char		fname[MAXFNAMELEN];
-
-			state->wre_errno = ENOENT;
-
-			XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize);
-			snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT,
-					 fname, lastRemovedSegNo);
-			return false;
-		}
-
 		/* Update state for read */
 		recptr += readbytes;
 		nbytes -= readbytes;
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -35,7 +35,7 @@ use crate::{
    },
    stream, url,
 };
-use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
+use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName};

 /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
 pub enum MaybeOwned<'a, T> {
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -100,7 +100,6 @@ pub(super) async fn authenticate(
        .dbname(&db_info.dbname)
        .user(&db_info.user);

-    ctx.set_dbname(db_info.dbname.into());
    ctx.set_user(db_info.user.into());
    ctx.set_project(db_info.aux.clone());
    info!("woken up a compute node");
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -11,6 +11,7 @@ use crate::{
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
+use smol_str::SmolStr;
 use std::{collections::HashSet, net::IpAddr, str::FromStr};
 use thiserror::Error;
 use tracing::{info, warn};
@@ -95,6 +96,13 @@ impl ComputeUserInfoMaybeEndpoint {
        let get_param = |key| params.get(key).ok_or(MissingKey(key));
        let user: RoleName = get_param("user")?.into();

+        // record the values if we have them
+        ctx.set_application(params.get("application_name").map(SmolStr::from));
+        ctx.set_user(user.clone());
+        if let Some(dbname) = params.get("database") {
+            ctx.set_dbname(dbname.into());
+        }
+
        // Project name might be passed via PG's command-line options.
        let endpoint_option = params
            .options_raw()
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -452,7 +452,7 @@ pub struct ApiLocks<K> {

 #[derive(Debug, thiserror::Error)]
 pub enum ApiLockError {
-    #[error("timeout acquiring resource permit")]
+    #[error("permit could not be acquired")]
    TimeoutError(#[from] tokio::time::error::Elapsed),
 }

@@ -504,7 +504,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
                    .clone()
            }
        };
-        let permit = semaphore.acquire_timeout(self.timeout).await;
+        let permit = semaphore.acquire_deadline(now + self.timeout).await;

        self.metrics
            .semaphore_acquire_seconds
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -13,7 +13,7 @@ use crate::{
    http,
    metrics::{CacheOutcome, Metrics},
    rate_limiter::EndpointRateLimiter,
-    scram, EndpointCacheKey,
+    scram, EndpointCacheKey, Normalize,
 };
 use crate::{cache::Cached, context::RequestMonitoring};
 use futures::TryFutureExt;
@@ -281,6 +281,14 @@ impl super::Api for Api {
            return Ok(cached);
        }

+        // check rate limit
+        if !self
+            .wake_compute_endpoint_rate_limiter
+            .check(user_info.endpoint.normalize().into(), 1)
+        {
+            return Err(WakeComputeError::TooManyConnections);
+        }
+
        let permit = self.locks.get_permit(&key).await?;

        // after getting back a permit - it's possible the cache was filled
@@ -293,15 +301,6 @@ impl super::Api for Api {
            }
        }

-        // check rate limit
-        if !self
-            .wake_compute_endpoint_rate_limiter
-            .check(user_info.endpoint.normalize_intern(), 1)
-        {
-            info!(key = &*key, "found cached compute node info");
-            return Err(WakeComputeError::TooManyConnections);
-        }
-
        let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?;
        ctx.set_project(node.aux.clone());
        let cold_start_info = node.aux.cold_start_info;
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -2,7 +2,6 @@

 use chrono::Utc;
 use once_cell::sync::OnceCell;
-use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
 use std::net::IpAddr;
 use tokio::sync::mpsc;
@@ -47,7 +46,6 @@ pub struct RequestMonitoring {
    pub(crate) auth_method: Option<AuthMethod>,
    success: bool,
    pub(crate) cold_start_info: ColdStartInfo,
-    pg_options: Option<StartupMessageParams>,

    // extra
    // This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -104,7 +102,6 @@ impl RequestMonitoring {
            success: false,
            rejected: None,
            cold_start_info: ColdStartInfo::Unknown,
-            pg_options: None,

            sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
            disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
@@ -135,18 +132,6 @@ impl RequestMonitoring {
        self.latency_timer.cold_start_info(info);
    }

-    pub fn set_db_options(&mut self, options: StartupMessageParams) {
-        self.set_application(options.get("application_name").map(SmolStr::from));
-        if let Some(user) = options.get("user") {
-            self.set_user(user.into());
-        }
-        if let Some(dbname) = options.get("database") {
-            self.set_dbname(dbname.into());
-        }
-
-        self.pg_options = Some(options);
-    }
-
    pub fn set_project(&mut self, x: MetricsAuxInfo) {
        if self.endpoint_id.is_none() {
            self.set_endpoint_id(x.endpoint_id.as_str().into())
@@ -170,10 +155,8 @@ impl RequestMonitoring {
        }
    }

-    fn set_application(&mut self, app: Option<SmolStr>) {
-        if let Some(app) = app {
-            self.application = Some(app);
-        }
+    pub fn set_application(&mut self, app: Option<SmolStr>) {
+        self.application = app.or_else(|| self.application.clone());
    }

    pub fn set_dbname(&mut self, dbname: DbName) {
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -13,9 +13,7 @@ use parquet::{
    },
    record::RecordWriter,
 };
-use pq_proto::StartupMessageParams;
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
-use serde::ser::SerializeMap;
 use tokio::{sync::mpsc, time};
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, Span};
@@ -89,7 +87,6 @@ pub struct RequestData {
    database: Option<String>,
    project: Option<String>,
    branch: Option<String>,
-    pg_options: Option<String>,
    auth_method: Option<&'static str>,
    error: Option<&'static str>,
    /// Success is counted if we form a HTTP response with sql rows inside
@@ -104,23 +101,6 @@ pub struct RequestData {
    disconnect_timestamp: Option<chrono::NaiveDateTime>,
 }

-struct Options<'a> {
-    options: &'a StartupMessageParams,
-}
-
-impl<'a> serde::Serialize for Options<'a> {
-    fn serialize<S>(&self, s: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        let mut state = s.serialize_map(None)?;
-        for (k, v) in self.options.iter() {
-            state.serialize_entry(k, v)?;
-        }
-        state.end()
-    }
-}
-
 impl From<&RequestMonitoring> for RequestData {
    fn from(value: &RequestMonitoring) -> Self {
        Self {
@@ -133,10 +113,6 @@ impl From<&RequestMonitoring> for RequestData {
            database: value.dbname.as_deref().map(String::from),
            project: value.project.as_deref().map(String::from),
            branch: value.branch.as_deref().map(String::from),
-            pg_options: value
-                .pg_options
-                .as_ref()
-                .and_then(|options| serde_json::to_string(&Options { options }).ok()),
            auth_method: value.auth_method.as_ref().map(|x| match x {
                super::AuthMethod::Web => "web",
                super::AuthMethod::ScramSha256 => "scram_sha_256",
@@ -518,7 +494,6 @@ mod tests {
            database: Some(hex::encode(rng.gen::<[u8; 16]>())),
            project: Some(hex::encode(rng.gen::<[u8; 16]>())),
            branch: Some(hex::encode(rng.gen::<[u8; 16]>())),
-            pg_options: None,
            auth_method: None,
            protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
            region: "us-east-1",
@@ -595,15 +570,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1315874, 3, 6000),
-                (1315867, 3, 6000),
-                (1315927, 3, 6000),
-                (1315884, 3, 6000),
-                (1316014, 3, 6000),
-                (1315856, 3, 6000),
-                (1315648, 3, 6000),
-                (1315884, 3, 6000),
-                (438913, 1, 2000)
+                (1315314, 3, 6000),
+                (1315307, 3, 6000),
+                (1315367, 3, 6000),
+                (1315324, 3, 6000),
+                (1315454, 3, 6000),
+                (1315296, 3, 6000),
+                (1315088, 3, 6000),
+                (1315324, 3, 6000),
+                (438713, 1, 2000)
            ]
        );

@@ -633,11 +608,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1223214, 5, 10000),
-                (1229364, 5, 10000),
-                (1231158, 5, 10000),
-                (1230520, 5, 10000),
-                (1221798, 5, 10000)
+                (1222212, 5, 10000),
+                (1228362, 5, 10000),
+                (1230156, 5, 10000),
+                (1229518, 5, 10000),
+                (1220796, 5, 10000)
            ]
        );

@@ -669,11 +644,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1208861, 5, 10000),
-                (1208592, 5, 10000),
-                (1208885, 5, 10000),
-                (1208873, 5, 10000),
-                (1209128, 5, 10000)
+                (1207859, 5, 10000),
+                (1207590, 5, 10000),
+                (1207883, 5, 10000),
+                (1207871, 5, 10000),
+                (1208126, 5, 10000)
            ]
        );

@@ -698,15 +673,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1315874, 3, 6000),
-                (1315867, 3, 6000),
-                (1315927, 3, 6000),
-                (1315884, 3, 6000),
-                (1316014, 3, 6000),
-                (1315856, 3, 6000),
-                (1315648, 3, 6000),
-                (1315884, 3, 6000),
-                (438913, 1, 2000)
+                (1315314, 3, 6000),
+                (1315307, 3, 6000),
+                (1315367, 3, 6000),
+                (1315324, 3, 6000),
+                (1315454, 3, 6000),
+                (1315296, 3, 6000),
+                (1315088, 3, 6000),
+                (1315324, 3, 6000),
+                (438713, 1, 2000)
            ]
        );

@@ -743,7 +718,7 @@ mod tests {
        // files are smaller than the size threshold, but they took too long to fill so were flushed early
        assert_eq!(
            file_stats,
-            [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)]
+            [(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
        );

        tmpdir.close().unwrap();
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -3,7 +3,6 @@
 use std::convert::Infallible;

 use anyhow::{bail, Context};
-use intern::{EndpointIdInt, EndpointIdTag, InternId};
 use tokio::task::JoinError;
 use tokio_util::sync::CancellationToken;
 use tracing::warn;
@@ -130,22 +129,20 @@ macro_rules! smol_str_wrapper {

 const POOLER_SUFFIX: &str = "-pooler";

-impl EndpointId {
+pub trait Normalize {
+    fn normalize(&self) -> Self;
+}
+
+impl<S: Clone + AsRef<str> + From<String>> Normalize for S {
    fn normalize(&self) -> Self {
-        if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
-            stripped.into()
+        if self.as_ref().ends_with(POOLER_SUFFIX) {
+            let mut s = self.as_ref().to_string();
+            s.truncate(s.len() - POOLER_SUFFIX.len());
+            s.into()
        } else {
            self.clone()
        }
    }
-
-    fn normalize_intern(&self) -> EndpointIdInt {
-        if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
-            EndpointIdTag::get_interner().get_or_intern(stripped)
-        } else {
-            self.into()
-        }
-    }
 }

 // 90% of role name strings are 20 characters or less.
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -267,8 +267,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        };
    drop(pause);

-    ctx.set_db_options(params.clone());
-
    let hostname = mode.hostname(stream.get_ref());

    let common_names = tls.map(|tls| &tls.common_names);
--- a/proxy/src/rate_limiter/limit_algorithm.rs
+++ b/proxy/src/rate_limiter/limit_algorithm.rs
@@ -3,7 +3,7 @@ use parking_lot::Mutex;
 use std::{pin::pin, sync::Arc, time::Duration};
 use tokio::{
    sync::Notify,
-    time::{error::Elapsed, Instant},
+    time::{error::Elapsed, timeout_at, Instant},
 };

 use self::aimd::Aimd;
@@ -80,7 +80,7 @@ pub struct LimiterInner {
 }

 impl LimiterInner {
-    fn update_limit(&mut self, latency: Duration, outcome: Option<Outcome>) {
+    fn update(&mut self, latency: Duration, outcome: Option<Outcome>) {
        if let Some(outcome) = outcome {
            let sample = Sample {
                latency,
@@ -92,12 +92,12 @@ impl LimiterInner {
    }

    fn take(&mut self, ready: &Notify) -> Option<()> {
-        if self.available >= 1 {
+        if self.available > 1 {
            self.available -= 1;
            self.in_flight += 1;

            // tell the next in the queue that there is a permit ready
-            if self.available >= 1 {
+            if self.available > 1 {
                ready.notify_one();
            }
            Some(())
@@ -157,12 +157,16 @@ impl DynamicLimiter {
    }

    /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
+    ///
+    /// Returns `Err(Elapsed)` if there are none available after `duration`.
    pub async fn acquire_timeout(self: &Arc<Self>, duration: Duration) -> Result<Token, Elapsed> {
-        tokio::time::timeout(duration, self.acquire()).await?
+        self.acquire_deadline(Instant::now() + duration).await
    }

-    /// Try to acquire a concurrency [Token].
-    async fn acquire(self: &Arc<Self>) -> Result<Token, Elapsed> {
+    /// Try to acquire a concurrency [Token], waiting until `deadline` if there are none available.
+    ///
+    /// Returns `Err(Elapsed)` if there are none available after `deadline`.
+    pub async fn acquire_deadline(self: &Arc<Self>, deadline: Instant) -> Result<Token, Elapsed> {
        if self.config.initial_limit == 0 {
            // If the rate limiter is disabled, we can always acquire a token.
            Ok(Token::disabled())
@@ -170,16 +174,22 @@ impl DynamicLimiter {
            let mut notified = pin!(self.ready.notified());
            let mut ready = notified.as_mut().enable();
            loop {
+                let mut limit = None;
                if ready {
                    let mut inner = self.inner.lock();
                    if inner.take(&self.ready).is_some() {
                        break Ok(Token::new(self.clone()));
-                    } else {
-                        notified.set(self.ready.notified());
+                    }
+                    limit = Some(inner.limit);
+                }
+                match timeout_at(deadline, notified.as_mut()).await {
+                    Ok(()) => ready = true,
+                    Err(e) => {
+                        let limit = limit.unwrap_or_else(|| self.inner.lock().limit);
+                        tracing::info!(limit, "could not acquire token in time");
+                        break Err(e);
                    }
                }
-                notified.as_mut().await;
-                ready = true;
            }
        }
    }
@@ -198,14 +208,14 @@ impl DynamicLimiter {

        let mut inner = self.inner.lock();

-        inner.update_limit(start.elapsed(), outcome);
-
-        inner.in_flight -= 1;
+        inner.update(start.elapsed(), outcome);
        if inner.in_flight < inner.limit {
            inner.available = inner.limit - inner.in_flight;
            // At least 1 permit is now available
            self.ready.notify_one();
        }
+
+        inner.in_flight -= 1;
    }

    /// The current state of the limiter.
--- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs
+++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
@@ -51,9 +51,7 @@ impl LimitAlgorithm for Aimd {
                // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1
                let limit = limit.floor() as usize;

-                let limit = limit.clamp(self.min, self.max);
-                tracing::info!(limit, "limit decreased");
-                limit
+                limit.clamp(self.min, self.max)
            }
        }
    }
@@ -69,53 +67,6 @@ mod tests {

    use super::*;

-    #[tokio::test(start_paused = true)]
-    async fn increase_decrease() {
-        let config = RateLimiterConfig {
-            initial_limit: 1,
-            algorithm: RateLimitAlgorithm::Aimd {
-                conf: Aimd {
-                    min: 1,
-                    max: 2,
-                    inc: 10,
-                    dec: 0.5,
-                    utilisation: 0.8,
-                },
-            },
-        };
-
-        let limiter = DynamicLimiter::new(config);
-
-        let token = limiter
-            .acquire_timeout(Duration::from_millis(1))
-            .await
-            .unwrap();
-        token.release(Outcome::Success);
-
-        assert_eq!(limiter.state().limit(), 2);
-
-        let token = limiter
-            .acquire_timeout(Duration::from_millis(1))
-            .await
-            .unwrap();
-        token.release(Outcome::Success);
-        assert_eq!(limiter.state().limit(), 2);
-
-        let token = limiter
-            .acquire_timeout(Duration::from_millis(1))
-            .await
-            .unwrap();
-        token.release(Outcome::Overload);
-        assert_eq!(limiter.state().limit(), 1);
-
-        let token = limiter
-            .acquire_timeout(Duration::from_millis(1))
-            .await
-            .unwrap();
-        token.release(Outcome::Overload);
-        assert_eq!(limiter.state().limit(), 1);
-    }
-
    #[tokio::test(start_paused = true)]
    async fn should_decrease_limit_on_overload() {
        let config = RateLimiterConfig {
@@ -134,7 +85,7 @@ mod tests {
        let limiter = DynamicLimiter::new(config);

        let token = limiter
-            .acquire_timeout(Duration::from_millis(100))
+            .acquire_timeout(Duration::from_millis(1))
            .await
            .unwrap();
        token.release(Outcome::Overload);
@@ -142,41 +93,6 @@ mod tests {
        assert_eq!(limiter.state().limit(), 5, "overload: decrease");
    }

-    #[tokio::test(start_paused = true)]
-    async fn acquire_timeout_times_out() {
-        let config = RateLimiterConfig {
-            initial_limit: 1,
-            algorithm: RateLimitAlgorithm::Aimd {
-                conf: Aimd {
-                    min: 1,
-                    max: 2,
-                    inc: 10,
-                    dec: 0.5,
-                    utilisation: 0.8,
-                },
-            },
-        };
-
-        let limiter = DynamicLimiter::new(config);
-
-        let token = limiter
-            .acquire_timeout(Duration::from_millis(1))
-            .await
-            .unwrap();
-        let now = tokio::time::Instant::now();
-        limiter
-            .acquire_timeout(Duration::from_secs(1))
-            .await
-            .err()
-            .unwrap();
-
-        assert!(now.elapsed() >= Duration::from_secs(1));
-
-        token.release(Outcome::Success);
-
-        assert_eq!(limiter.state().limit(), 2);
-    }
-
    #[tokio::test(start_paused = true)]
    async fn should_increase_limit_on_success_when_using_gt_util_threshold() {
        let config = RateLimiterConfig {
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -17,7 +17,6 @@ use hyper1::http::HeaderValue;
 use hyper1::Response;
 use hyper1::StatusCode;
 use hyper1::{HeaderMap, Request};
-use pq_proto::StartupMessageParamsBuilder;
 use serde_json::json;
 use serde_json::Value;
 use tokio::time;
@@ -193,13 +192,13 @@ fn get_conn_info(

    let mut options = Option::None;

-    let mut params = StartupMessageParamsBuilder::default();
-    params.insert("user", &username);
-    params.insert("database", &dbname);
    for (key, value) in pairs {
-        params.insert(&key, &value);
-        if key == "options" {
-            options = Some(NeonOptions::parse_options_raw(&value));
+        match &*key {
+            "options" => {
+                options = Some(NeonOptions::parse_options_raw(&value));
+            }
+            "application_name" => ctx.set_application(Some(value.into())),
+            _ => {}
        }
    }

--- a/s3_scrubber/Cargo.toml
+++ b/s3_scrubber/Cargo.toml
@@ -11,7 +11,6 @@ either.workspace = true
 tokio-rustls.workspace = true
 anyhow.workspace = true
 hex.workspace = true
-humantime.workspace = true
 thiserror.workspace = true
 rand.workspace = true
 bytes.workspace = true
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -1,7 +1,7 @@
 use std::collections::{HashMap, HashSet};

 use anyhow::Context;
-use aws_sdk_s3::Client;
+use aws_sdk_s3::{types::ObjectIdentifier, Client};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
 use tracing::{error, info, warn};
@@ -70,7 +70,7 @@ pub(crate) fn branch_cleanup_and_check_errors(

    match s3_data {
        Some(s3_data) => {
-            result.garbage_keys.extend(s3_data.unknown_keys);
+            result.garbage_keys.extend(s3_data.keys_to_remove);

            match s3_data.blob_data {
                BlobDataParseResult::Parsed {
@@ -93,12 +93,12 @@ pub(crate) fn branch_cleanup_and_check_errors(
                    }

                    if index_part.metadata.disk_consistent_lsn()
-                        != index_part.duplicated_disk_consistent_lsn()
+                        != index_part.get_disk_consistent_lsn()
                    {
                        result.errors.push(format!(
                            "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})",
                            index_part.metadata.disk_consistent_lsn(),
-                            index_part.duplicated_disk_consistent_lsn(),
+                            index_part.get_disk_consistent_lsn(),
                        ))
                    }

@@ -240,12 +240,7 @@ impl TenantObjectListing {
 #[derive(Debug)]
 pub(crate) struct S3TimelineBlobData {
    pub(crate) blob_data: BlobDataParseResult,
-
-    // Index objects that were not used when loading `blob_data`, e.g. those from old generations
-    pub(crate) unused_index_keys: Vec<String>,
-
-    // Objects whose keys were not recognized at all, i.e. not layer files, not indices
-    pub(crate) unknown_keys: Vec<String>,
+    pub(crate) keys_to_remove: Vec<String>,
 }

 #[derive(Debug)]
@@ -281,12 +276,12 @@ pub(crate) async fn list_timeline_blobs(
    let mut s3_layers = HashSet::new();

    let mut errors = Vec::new();
-    let mut unknown_keys = Vec::new();
+    let mut keys_to_remove = Vec::new();

    let mut timeline_dir_target = s3_root.timeline_root(&id);
    timeline_dir_target.delimiter = String::new();

-    let mut index_part_keys: Vec<String> = Vec::new();
+    let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
    let mut initdb_archive: bool = false;

    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
@@ -297,16 +292,16 @@ pub(crate) async fn list_timeline_blobs(
        let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
        match blob_name {
            Some(name) if name.starts_with("index_part.json") => {
-                tracing::debug!("Index key {key}");
-                index_part_keys.push(key.to_owned())
+                tracing::info!("Index key {key}");
+                index_parts.push(obj)
            }
            Some("initdb.tar.zst") => {
-                tracing::debug!("initdb archive {key}");
+                tracing::info!("initdb archive {key}");
                initdb_archive = true;
            }
            Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
                Ok((new_layer, gen)) => {
-                    tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen);
+                    tracing::info!("Parsed layer key: {} {:?}", new_layer, gen);
                    s3_layers.insert((new_layer, gen));
                }
                Err(e) => {
@@ -314,37 +309,37 @@ pub(crate) async fn list_timeline_blobs(
                    errors.push(
                        format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
                    );
-                    unknown_keys.push(key.to_string());
+                    keys_to_remove.push(key.to_string());
                }
            },
            None => {
-                tracing::warn!("Unknown key {}", key);
+                tracing::info!("Peculiar key {}", key);
                errors.push(format!("S3 list response got an object with odd key {key}"));
-                unknown_keys.push(key.to_string());
+                keys_to_remove.push(key.to_string());
            }
        }
    }

-    if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive {
-        tracing::debug!(
+    if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive {
+        tracing::info!(
            "Timeline is empty apart from initdb archive: expected post-deletion state."
        );
        return Ok(S3TimelineBlobData {
            blob_data: BlobDataParseResult::Relic,
-            unused_index_keys: index_part_keys,
-            unknown_keys: Vec::new(),
+            keys_to_remove: Vec::new(),
        });
    }

    // Choose the index_part with the highest generation
-    let (index_part_object, index_part_generation) = match index_part_keys
+    let (index_part_object, index_part_generation) = match index_parts
        .iter()
-        .filter_map(|key| {
+        .filter_map(|k| {
+            let key = k.key();
            // Stripping the index key to the last part, because RemotePath doesn't
            // like absolute paths, and depending on prefix_in_bucket it's possible
            // for the keys we read back to start with a slash.
            let basename = key.rsplit_once('/').unwrap().1;
-            parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g))
+            parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g))
        })
        .max_by_key(|i| i.1)
        .map(|(k, g)| (k.clone(), g))
@@ -352,18 +347,15 @@ pub(crate) async fn list_timeline_blobs(
        Some((key, gen)) => (Some(key), gen),
        None => {
            // Legacy/missing case: one or zero index parts, which did not have a generation
-            (index_part_keys.pop(), Generation::none())
+            (index_parts.pop(), Generation::none())
        }
    };

-    match index_part_object.as_ref() {
-        Some(selected) => index_part_keys.retain(|k| k != selected),
-        None => {
-            errors.push("S3 list response got no index_part.json file".to_string());
-        }
+    if index_part_object.is_none() {
+        errors.push("S3 list response got no index_part.json file".to_string());
    }

-    if let Some(index_part_object_key) = index_part_object.as_ref() {
+    if let Some(index_part_object_key) = index_part_object.as_ref().map(|object| object.key()) {
        let index_part_bytes = download_object_with_retries(
            s3_client,
            &timeline_dir_target.bucket_name,
@@ -380,14 +372,17 @@ pub(crate) async fn list_timeline_blobs(
                        index_part_generation,
                        s3_layers,
                    },
-                    unused_index_keys: index_part_keys,
-                    unknown_keys,
+                    keys_to_remove,
                })
            }
            Err(index_parse_error) => errors.push(format!(
                "index_part.json body parsing error: {index_parse_error}"
            )),
        }
+    } else {
+        errors.push(format!(
+            "Index part object {index_part_object:?} has no key"
+        ));
    }

    if errors.is_empty() {
@@ -398,7 +393,6 @@ pub(crate) async fn list_timeline_blobs(

    Ok(S3TimelineBlobData {
        blob_data: BlobDataParseResult::Incorrect(errors),
-        unused_index_keys: index_part_keys,
-        unknown_keys,
+        keys_to_remove,
    })
 }
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -4,7 +4,6 @@ pub mod checks;
 pub mod cloud_admin_api;
 pub mod garbage;
 pub mod metadata_stream;
-pub mod pageserver_physical_gc;
 pub mod scan_pageserver_metadata;
 pub mod scan_safekeeper_metadata;
 pub mod tenant_snapshot;
@@ -397,7 +396,7 @@ async fn download_object_with_retries(
            .await
        {
            Ok(bytes_read) => {
-                tracing::debug!("Downloaded {bytes_read} bytes for object {key}");
+                tracing::info!("Downloaded {bytes_read} bytes for object with key {key}");
                return Ok(body_buf);
            }
            Err(e) => {
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -2,13 +2,11 @@ use anyhow::bail;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
-use s3_scrubber::pageserver_physical_gc::GcMode;
 use s3_scrubber::scan_pageserver_metadata::scan_metadata;
 use s3_scrubber::tenant_snapshot::SnapshotDownloader;
 use s3_scrubber::{
-    init_logging, pageserver_physical_gc::pageserver_physical_gc,
-    scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
-    TraversingDepth,
+    init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig,
+    NodeKind, TraversingDepth,
 };

 use clap::{Parser, Subcommand};
@@ -64,14 +62,6 @@ enum Command {
        #[arg(short, long)]
        output_path: Utf8PathBuf,
    },
-    PageserverPhysicalGc {
-        #[arg(long = "tenant-id", num_args = 0..)]
-        tenant_ids: Vec<TenantShardId>,
-        #[arg(long = "min-age")]
-        min_age: humantime::Duration,
-        #[arg(short, long, default_value_t = GcMode::IndicesOnly)]
-        mode: GcMode,
-    },
 }

 #[tokio::main]
@@ -85,7 +75,6 @@ async fn main() -> anyhow::Result<()> {
        Command::FindGarbage { .. } => "find-garbage",
        Command::PurgeGarbage { .. } => "purge-garbage",
        Command::TenantSnapshot { .. } => "tenant-snapshot",
-        Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc",
    };
    let _guard = init_logging(&format!(
        "{}_{}_{}_{}.log",
@@ -189,15 +178,5 @@ async fn main() -> anyhow::Result<()> {
                SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?;
            downloader.download().await
        }
-        Command::PageserverPhysicalGc {
-            tenant_ids,
-            min_age,
-            mode,
-        } => {
-            let summary =
-                pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?;
-            println!("{}", serde_json::to_string(&summary).unwrap());
-            Ok(())
-        }
    }
 }
--- a/s3_scrubber/src/pageserver_physical_gc.rs
+++ b/s3_scrubber/src/pageserver_physical_gc.rs
@@ -1,239 +0,0 @@
-use std::time::{Duration, UNIX_EPOCH};
-
-use crate::checks::{list_timeline_blobs, BlobDataParseResult};
-use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
-use aws_sdk_s3::Client;
-use futures_util::{StreamExt, TryStreamExt};
-use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
-use pageserver::tenant::IndexPart;
-use pageserver_api::shard::TenantShardId;
-use remote_storage::RemotePath;
-use serde::Serialize;
-use tracing::{info_span, Instrument};
-use utils::generation::Generation;
-
-#[derive(Serialize, Default)]
-pub struct GcSummary {
-    indices_deleted: usize,
-    remote_storage_errors: usize,
-}
-
-#[derive(clap::ValueEnum, Debug, Clone, Copy)]
-pub enum GcMode {
-    // Delete nothing
-    DryRun,
-
-    // Enable only removing old-generation indices
-    IndicesOnly,
-    // Enable all forms of GC
-    // TODO: this will be used when shard split ancestor layer deletion is added
-    // All,
-}
-
-impl std::fmt::Display for GcMode {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            GcMode::DryRun => write!(f, "dry-run"),
-            GcMode::IndicesOnly => write!(f, "indices-only"),
-        }
-    }
-}
-
-async fn maybe_delete_index(
-    s3_client: &Client,
-    bucket_config: &BucketConfig,
-    min_age: &Duration,
-    latest_gen: Generation,
-    key: &str,
-    mode: GcMode,
-    summary: &mut GcSummary,
-) {
-    // Validation: we will only delete things that parse cleanly
-    let basename = key.rsplit_once('/').unwrap().1;
-    let candidate_generation =
-        match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) {
-            Some(g) => g,
-            None => {
-                if basename == IndexPart::FILE_NAME {
-                    // A legacy pre-generation index
-                    Generation::none()
-                } else {
-                    // A strange key: we will not delete this because we don't understand it.
-                    tracing::warn!("Bad index key");
-                    return;
-                }
-            }
-        };
-
-    // Validation: we will only delete indices more than one generation old, to avoid interfering
-    // in typical migrations, even if they are very long running.
-    if candidate_generation >= latest_gen {
-        // This shouldn't happen: when we loaded metadata, it should have selected the latest
-        // generation already, and only populated [`S3TimelineBlobData::unused_index_keys`]
-        // with older generations.
-        tracing::warn!("Deletion candidate is >= latest generation, this is a bug!");
-        return;
-    } else if candidate_generation.next() == latest_gen {
-        // Skip deleting the latest-1th generation's index.
-        return;
-    }
-
-    // Validation: we will only delete indices after one week, so that during incidents we will have
-    // easy access to recent indices.
-    let age: Duration = match s3_client
-        .head_object()
-        .bucket(&bucket_config.bucket)
-        .key(key)
-        .send()
-        .await
-    {
-        Ok(response) => match response.last_modified {
-            None => {
-                tracing::warn!("Missing last_modified");
-                summary.remote_storage_errors += 1;
-                return;
-            }
-            Some(last_modified) => {
-                let last_modified =
-                    UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64());
-                match last_modified.elapsed() {
-                    Ok(e) => e,
-                    Err(_) => {
-                        tracing::warn!("Bad last_modified time: {last_modified:?}");
-                        return;
-                    }
-                }
-            }
-        },
-        Err(e) => {
-            tracing::warn!("Failed to HEAD {key}: {e}");
-            summary.remote_storage_errors += 1;
-            return;
-        }
-    };
-    if &age < min_age {
-        tracing::info!(
-            "Skipping young object {} < {}",
-            age.as_secs_f64(),
-            min_age.as_secs_f64()
-        );
-        return;
-    }
-
-    if matches!(mode, GcMode::DryRun) {
-        tracing::info!("Dry run: would delete this key");
-        return;
-    }
-
-    // All validations passed: erase the object
-    match s3_client
-        .delete_object()
-        .bucket(&bucket_config.bucket)
-        .key(key)
-        .send()
-        .await
-    {
-        Ok(_) => {
-            tracing::info!("Successfully deleted index");
-            summary.indices_deleted += 1;
-        }
-        Err(e) => {
-            tracing::warn!("Failed to delete index: {e}");
-            summary.remote_storage_errors += 1;
-        }
-    }
-}
-
-/// Physical garbage collection: removing unused S3 objects.  This is distinct from the garbage collection
-/// done inside the pageserver, which operates at a higher level (keys, layers).  This type of garbage collection
-/// is about removing:
-/// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between
-///   uploading a layer and uploading an index)
-/// - Index objects from historic generations
-///
-/// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and
-/// make sure that object listings don't get slowed down by large numbers of garbage objects.
-pub async fn pageserver_physical_gc(
-    bucket_config: BucketConfig,
-    tenant_ids: Vec<TenantShardId>,
-    min_age: Duration,
-    mode: GcMode,
-) -> anyhow::Result<GcSummary> {
-    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?;
-
-    let tenants = if tenant_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants(&s3_client, &target))
-    } else {
-        futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
-    };
-
-    // How many tenants to process in parallel.  We need to be mindful of pageservers
-    // accessing the same per tenant prefixes, so use a lower setting than pageservers.
-    const CONCURRENCY: usize = 32;
-
-    // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
-    let timelines = timelines.try_buffered(CONCURRENCY);
-    let timelines = timelines.try_flatten();
-
-    // Generate a stream of S3TimelineBlobData
-    async fn gc_timeline(
-        s3_client: &Client,
-        bucket_config: &BucketConfig,
-        min_age: &Duration,
-        target: &RootTarget,
-        mode: GcMode,
-        ttid: TenantShardTimelineId,
-    ) -> anyhow::Result<GcSummary> {
-        let mut summary = GcSummary::default();
-        let data = list_timeline_blobs(s3_client, ttid, target).await?;
-
-        let (latest_gen, candidates) = match &data.blob_data {
-            BlobDataParseResult::Parsed {
-                index_part: _index_part,
-                index_part_generation,
-                s3_layers: _s3_layers,
-            } => (*index_part_generation, data.unused_index_keys),
-            BlobDataParseResult::Relic => {
-                // Post-deletion tenant location: don't try and GC it.
-                return Ok(summary);
-            }
-            BlobDataParseResult::Incorrect(reasons) => {
-                // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
-                tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}");
-                return Ok(summary);
-            }
-        };
-
-        for key in candidates {
-            maybe_delete_index(
-                s3_client,
-                bucket_config,
-                min_age,
-                latest_gen,
-                &key,
-                mode,
-                &mut summary,
-            )
-            .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, key))
-            .await;
-        }
-
-        Ok(summary)
-    }
-    let timelines = timelines
-        .map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid));
-    let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
-
-    let mut summary = GcSummary::default();
-
-    while let Some(i) = timelines.next().await {
-        let tl_summary = i?;
-
-        summary.indices_deleted += tl_summary.indices_deleted;
-        summary.remote_storage_errors += tl_summary.remote_storage_errors;
-    }
-
-    Ok(summary)
-}
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -29,12 +29,13 @@ use safekeeper::defaults::{
    DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
    DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
 };
-use safekeeper::http;
+use safekeeper::remove_wal;
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
 use safekeeper::SafeKeeperConf;
 use safekeeper::{broker, WAL_SERVICE_RUNTIME};
 use safekeeper::{control_file, BROKER_RUNTIME};
+use safekeeper::{http, WAL_REMOVER_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
 use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
@@ -440,6 +441,14 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        .map(|res| ("broker main".to_owned(), res));
    tasks_handles.push(Box::pin(broker_task_handle));

+    let conf_ = conf.clone();
+    let wal_remover_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
+        .spawn(remove_wal::task_main(conf_))
+        .map(|res| ("WAL remover".to_owned(), res));
+    tasks_handles.push(Box::pin(wal_remover_handle));
+
    set_build_info_metric(GIT_VERSION, BUILD_TAG);

    // TODO: update tokio-stream, convert to real async Stream with
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -2,7 +2,7 @@

 use anyhow::{bail, ensure, Context, Result};
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use camino::{Utf8Path, Utf8PathBuf};
+use camino::Utf8PathBuf;
 use tokio::fs::File;
 use tokio::io::AsyncWriteExt;
 use utils::crashsafe::durable_rename;
@@ -12,9 +12,9 @@ use std::ops::Deref;
 use std::path::Path;
 use std::time::Instant;

+use crate::control_file_upgrade::upgrade_control_file;
 use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
 use crate::state::TimelinePersistentState;
-use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
 use utils::{bin_ser::LeSer, id::TenantTimelineId};

 use crate::SafeKeeperConf;
@@ -43,7 +43,7 @@ pub trait Storage: Deref<Target = TimelinePersistentState> {
 pub struct FileStorage {
    // save timeline dir to avoid reconstructing it every time
    timeline_dir: Utf8PathBuf,
-    no_sync: bool,
+    conf: SafeKeeperConf,

    /// Last state persisted to disk.
    state: TimelinePersistentState,
@@ -54,12 +54,13 @@ pub struct FileStorage {
 impl FileStorage {
    /// Initialize storage by loading state from disk.
    pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
-        let timeline_dir = get_timeline_dir(conf, ttid);
-        let state = Self::load_control_file_from_dir(&timeline_dir)?;
+        let timeline_dir = conf.timeline_dir(ttid);
+
+        let state = Self::load_control_file_conf(conf, ttid)?;

        Ok(FileStorage {
            timeline_dir,
-            no_sync: conf.no_sync,
+            conf: conf.clone(),
            state,
            last_persist_at: Instant::now(),
        })
@@ -73,7 +74,7 @@ impl FileStorage {
    ) -> Result<FileStorage> {
        let store = FileStorage {
            timeline_dir,
-            no_sync: conf.no_sync,
+            conf: conf.clone(),
            state,
            last_persist_at: Instant::now(),
        };
@@ -101,9 +102,12 @@ impl FileStorage {
        upgrade_control_file(buf, version)
    }

-    /// Load control file from given directory.
-    pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result<TimelinePersistentState> {
-        let path = timeline_dir.join(CONTROL_FILE_NAME);
+    /// Load control file for given ttid at path specified by conf.
+    pub fn load_control_file_conf(
+        conf: &SafeKeeperConf,
+        ttid: &TenantTimelineId,
+    ) -> Result<TimelinePersistentState> {
+        let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME);
        Self::load_control_file(path)
    }

@@ -199,7 +203,7 @@ impl Storage for FileStorage {
        })?;

        let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
-        durable_rename(&control_partial_path, &control_path, !self.no_sync).await?;
+        durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?;

        // update internal state
        self.state = s.clone();
@@ -229,13 +233,12 @@ mod test {
        conf: &SafeKeeperConf,
        ttid: &TenantTimelineId,
    ) -> Result<(FileStorage, TimelinePersistentState)> {
-        let timeline_dir = get_timeline_dir(conf, ttid);
-        fs::create_dir_all(&timeline_dir)
+        fs::create_dir_all(conf.timeline_dir(ttid))
            .await
            .expect("failed to create timeline dir");
        Ok((
            FileStorage::restore_new(ttid, conf)?,
-            FileStorage::load_control_file_from_dir(&timeline_dir)?,
+            FileStorage::load_control_file_conf(conf, ttid)?,
        ))
    }

@@ -243,11 +246,11 @@ mod test {
        conf: &SafeKeeperConf,
        ttid: &TenantTimelineId,
    ) -> Result<(FileStorage, TimelinePersistentState)> {
-        let timeline_dir = get_timeline_dir(conf, ttid);
-        fs::create_dir_all(&timeline_dir)
+        fs::create_dir_all(conf.timeline_dir(ttid))
            .await
            .expect("failed to create timeline dir");
        let state = TimelinePersistentState::empty();
+        let timeline_dir = conf.timeline_dir(ttid);
        let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?;
        Ok((storage, state))
    }
@@ -288,7 +291,7 @@ mod test {
                .await
                .expect("failed to persist state");
        }
-        let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME);
+        let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
        let mut data = fs::read(&control_path).await.unwrap();
        data[0] += 1; // change the first byte of the file to fail checksum validation
        fs::write(&control_path, &data)
--- a/safekeeper/src/copy_timeline.rs
+++ b/safekeeper/src/copy_timeline.rs
@@ -15,10 +15,10 @@ use crate::{
    control_file::{FileStorage, Storage},
    pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
    state::TimelinePersistentState,
-    timeline::{FullAccessTimeline, Timeline, TimelineError},
+    timeline::{Timeline, TimelineError},
    wal_backup::copy_s3_segments,
    wal_storage::{wal_file_paths, WalReader},
-    GlobalTimelines,
+    GlobalTimelines, SafeKeeperConf,
 };

 // we don't want to have more than 10 segments on disk after copy, because they take space
@@ -46,14 +46,12 @@ pub async fn handle_request(request: Request) -> Result<()> {
        }
    }

-    let source_tli = request.source.full_access_guard().await?;
-
    let conf = &GlobalTimelines::get_global_config();
    let ttid = request.destination_ttid;

    let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;

-    let (mem_state, state) = source_tli.get_state().await;
+    let (mem_state, state) = request.source.get_state().await;
    let start_lsn = state.timeline_start_lsn;
    if start_lsn == Lsn::INVALID {
        bail!("timeline is not initialized");
@@ -62,7 +60,7 @@ pub async fn handle_request(request: Request) -> Result<()> {

    {
        let commit_lsn = mem_state.commit_lsn;
-        let flush_lsn = source_tli.get_flush_lsn().await;
+        let flush_lsn = request.source.get_flush_lsn().await;

        info!(
            "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}",
@@ -129,8 +127,10 @@ pub async fn handle_request(request: Request) -> Result<()> {
    .await?;

    copy_disk_segments(
-        &source_tli,
+        conf,
+        &state,
        wal_seg_size,
+        &request.source.ttid,
        new_backup_lsn,
        request.until_lsn,
        &tli_dir_path,
@@ -159,13 +159,21 @@ pub async fn handle_request(request: Request) -> Result<()> {
 }

 async fn copy_disk_segments(
-    tli: &FullAccessTimeline,
+    conf: &SafeKeeperConf,
+    persisted_state: &TimelinePersistentState,
    wal_seg_size: usize,
+    source_ttid: &TenantTimelineId,
    start_lsn: Lsn,
    end_lsn: Lsn,
    tli_dir_path: &Utf8PathBuf,
 ) -> Result<()> {
-    let mut wal_reader = tli.get_walreader(start_lsn).await?;
+    let mut wal_reader = WalReader::new(
+        conf.workdir.clone(),
+        conf.timeline_dir(source_ttid),
+        persisted_state,
+        start_lsn,
+        true,
+    )?;

    let mut buf = [0u8; MAX_SEND_SIZE];

--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -10,7 +10,6 @@ use std::sync::Arc;
 use anyhow::bail;
 use anyhow::Result;
 use camino::Utf8Path;
-use camino::Utf8PathBuf;
 use chrono::{DateTime, Utc};
 use postgres_ffi::XLogSegNo;
 use postgres_ffi::MAX_SEND_SIZE;
@@ -27,8 +26,7 @@ use crate::safekeeper::TermHistory;
 use crate::send_wal::WalSenderState;
 use crate::state::TimelineMemState;
 use crate::state::TimelinePersistentState;
-use crate::timeline::get_timeline_dir;
-use crate::timeline::FullAccessTimeline;
+use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;

@@ -70,7 +68,6 @@ pub struct Response {
 pub struct TimelineDumpSer {
    pub tli: Arc<crate::timeline::Timeline>,
    pub args: Args,
-    pub timeline_dir: Utf8PathBuf,
    pub runtime: Arc<tokio::runtime::Runtime>,
 }

@@ -88,20 +85,14 @@ impl Serialize for TimelineDumpSer {
    where
        S: serde::Serializer,
    {
-        let dump = self.runtime.block_on(build_from_tli_dump(
-            &self.tli,
-            &self.args,
-            &self.timeline_dir,
-        ));
+        let dump = self
+            .runtime
+            .block_on(build_from_tli_dump(self.tli.clone(), self.args.clone()));
        dump.serialize(serializer)
    }
 }

-async fn build_from_tli_dump(
-    timeline: &Arc<crate::timeline::Timeline>,
-    args: &Args,
-    timeline_dir: &Utf8Path,
-) -> Timeline {
+async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Args) -> Timeline {
    let control_file = if args.dump_control_file {
        let mut state = timeline.get_state().await.1;
        if !args.dump_term_history {
@@ -121,8 +112,7 @@ async fn build_from_tli_dump(
    let disk_content = if args.dump_disk_content {
        // build_disk_content can fail, but we don't want to fail the whole
        // request because of that.
-        // Note: timeline can be in offloaded state, this is not a problem.
-        build_disk_content(timeline_dir).ok()
+        build_disk_content(&timeline.timeline_dir).ok()
    } else {
        None
    };
@@ -196,7 +186,6 @@ pub struct FileInfo {
 pub async fn build(args: Args) -> Result<Response> {
    let start_time = Utc::now();
    let timelines_count = GlobalTimelines::timelines_count();
-    let config = GlobalTimelines::get_global_config();

    let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
        // If both tenant_id and timeline_id are specified, we can just get the
@@ -234,11 +223,12 @@ pub async fn build(args: Args) -> Result<Response> {
        timelines.push(TimelineDumpSer {
            tli,
            args: args.clone(),
-            timeline_dir: get_timeline_dir(&config, &ttid),
            runtime: runtime.clone(),
        });
    }

+    let config = GlobalTimelines::get_global_config();
+
    Ok(Response {
        start_time,
        finish_time: Utc::now(),
@@ -326,19 +316,27 @@ pub struct TimelineDigest {
 }

 pub async fn calculate_digest(
-    tli: &FullAccessTimeline,
+    tli: &Arc<crate::timeline::Timeline>,
    request: TimelineDigestRequest,
 ) -> Result<TimelineDigest> {
    if request.from_lsn > request.until_lsn {
        bail!("from_lsn is greater than until_lsn");
    }

+    let conf = GlobalTimelines::get_global_config();
    let (_, persisted_state) = tli.get_state().await;
+
    if persisted_state.timeline_start_lsn > request.from_lsn {
        bail!("requested LSN is before the start of the timeline");
    }

-    let mut wal_reader = tli.get_walreader(request.from_lsn).await?;
+    let mut wal_reader = WalReader::new(
+        conf.workdir.clone(),
+        tli.timeline_dir.clone(),
+        &persisted_state,
+        request.from_lsn,
+        true,
+    )?;

    let mut hasher = Sha256::new();
    let mut buf = [0u8; MAX_SEND_SIZE];
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -85,11 +85,11 @@ impl From<TermSwitchApiEntry> for TermLsn {
    }
 }

-/// Augment AcceptorState with last_log_term for convenience
+/// Augment AcceptorState with epoch for convenience
 #[derive(Debug, Serialize, Deserialize)]
 pub struct AcceptorStateStatus {
    pub term: Term,
-    pub epoch: Term, // aka last_log_term
+    pub epoch: Term,
    pub term_history: Vec<TermSwitchApiEntry>,
 }

@@ -130,7 +130,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
    let (inmem, state) = tli.get_state().await;
    let flush_lsn = tli.get_flush_lsn().await;

-    let last_log_term = state.acceptor_state.get_last_log_term(flush_lsn);
+    let epoch = state.acceptor_state.get_epoch(flush_lsn);
    let term_history = state
        .acceptor_state
        .term_history
@@ -143,7 +143,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
        .collect();
    let acc_state = AcceptorStateStatus {
        term: state.acceptor_state.term,
-        epoch: last_log_term,
+        epoch,
        term_history,
    };

@@ -249,10 +249,6 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
    };

    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
-    let tli = tli
-        .full_access_guard()
-        .await
-        .map_err(ApiError::InternalServerError)?;

    let response = debug_dump::calculate_digest(&tli, request)
        .await
@@ -272,12 +268,8 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
    let filename: String = parse_request_param(&request, "filename")?;

    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
-    let tli = tli
-        .full_access_guard()
-        .await
-        .map_err(ApiError::InternalServerError)?;

-    let filepath = tli.get_timeline_dir().join(filename);
+    let filepath = tli.timeline_dir.join(filename);
    let mut file = File::open(&filepath)
        .await
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
@@ -295,7 +287,7 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
        .map_err(|e| ApiError::InternalServerError(e.into()))
 }

-/// Force persist control file.
+/// Force persist control file and remove old WAL.
 async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;

@@ -305,13 +297,13 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
    );

    let tli = GlobalTimelines::get(ttid)?;
-    tli.write_shared_state()
-        .await
-        .sk
-        .state
-        .flush()
+    tli.maybe_persist_control_file(true)
        .await
        .map_err(ApiError::InternalServerError)?;
+    tli.remove_old_wal()
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
    json_response(StatusCode::OK, ())
 }

--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -6,6 +6,8 @@
 //! modifications in tests.
 //!

+use std::sync::Arc;
+
 use anyhow::Context;
 use bytes::Bytes;
 use postgres_backend::QueryError;
@@ -21,7 +23,7 @@ use crate::safekeeper::{
 };
 use crate::safekeeper::{Term, TermHistory, TermLsn};
 use crate::state::TimelinePersistentState;
-use crate::timeline::FullAccessTimeline;
+use crate::timeline::Timeline;
 use crate::GlobalTimelines;
 use postgres_backend::PostgresBackend;
 use postgres_ffi::encode_logical_message;
@@ -102,8 +104,8 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
 async fn prepare_safekeeper(
    ttid: TenantTimelineId,
    pg_version: u32,
-) -> anyhow::Result<FullAccessTimeline> {
-    let tli = GlobalTimelines::create(
+) -> anyhow::Result<Arc<Timeline>> {
+    GlobalTimelines::create(
        ttid,
        ServerInfo {
            pg_version,
@@ -113,16 +115,10 @@ async fn prepare_safekeeper(
        Lsn::INVALID,
        Lsn::INVALID,
    )
-    .await?;
-
-    tli.full_access_guard().await
+    .await
 }

-async fn send_proposer_elected(
-    tli: &FullAccessTimeline,
-    term: Term,
-    lsn: Lsn,
-) -> anyhow::Result<()> {
+async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
    // add new term to existing history
    let history = tli.get_state().await.1.acceptor_state.term_history;
    let history = history.up_to(lsn.checked_sub(1u64).unwrap());
@@ -151,7 +147,7 @@ pub struct InsertedWAL {
 /// Extend local WAL with new LogicalMessage record. To do that,
 /// create AppendRequest with new WAL and pass it to safekeeper.
 pub async fn append_logical_message(
-    tli: &FullAccessTimeline,
+    tli: &Arc<Timeline>,
    msg: &AppendLogicalMessage,
 ) -> anyhow::Result<InsertedWAL> {
    let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
@@ -169,7 +165,7 @@ pub async fn append_logical_message(
    let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
        h: AppendRequestHeader {
            term: msg.term,
-            term_start_lsn: begin_lsn,
+            epoch_start_lsn: begin_lsn,
            begin_lsn,
            end_lsn,
            commit_lsn,
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -7,7 +7,10 @@ use tokio::runtime::Runtime;
 use std::time::Duration;
 use storage_broker::Uri;

-use utils::{auth::SwappableJwtAuth, id::NodeId};
+use utils::{
+    auth::SwappableJwtAuth,
+    id::{NodeId, TenantId, TenantTimelineId},
+};

 mod auth;
 pub mod broker;
@@ -86,6 +89,15 @@ pub struct SafeKeeperConf {
 }

 impl SafeKeeperConf {
+    pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf {
+        self.workdir.join(tenant_id.to_string())
+    }
+
+    pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf {
+        self.tenant_dir(&ttid.tenant_id)
+            .join(ttid.timeline_id.to_string())
+    }
+
    pub fn is_wal_backup_enabled(&self) -> bool {
        self.remote_storage.is_some() && self.wal_backup_enabled
    }
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -12,8 +12,8 @@ use metrics::{
    core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
    proto::MetricFamily,
    register_int_counter, register_int_counter_pair, register_int_counter_pair_vec,
-    register_int_counter_vec, register_int_gauge, Gauge, IntCounter, IntCounterPair,
-    IntCounterPairVec, IntCounterVec, IntGaugeVec,
+    register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
+    IntGaugeVec,
 };
 use once_cell::sync::Lazy;

@@ -163,13 +163,6 @@ pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    )
    .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
 });
-pub static MANAGERS_RUNNING: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!(
-        "safekeeper_managers_running",
-        "Number of timeline managers running. Should match safekeeper_timelines minus safekeeper_timelines_cancelled."
-    )
-    .expect("failed to define a metric")
-});
 pub static MANAGER_ITERATIONS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "safekeeper_manager_iterations_total",
@@ -392,9 +385,7 @@ pub struct TimelineCollector {
    flushed_wal_seconds: GaugeVec,
    collect_timeline_metrics: Gauge,
    timelines_count: IntGauge,
-    timelines_cancelled_count: IntGauge,
    active_timelines_count: IntGauge,
-    active_timelines_set_size: IntGauge,
 }

 impl Default for TimelineCollector {
@@ -582,18 +573,11 @@ impl TimelineCollector {

        let timelines_count = IntGauge::new(
            "safekeeper_timelines",
-            "Total number of timelines loaded in-memory, including cancelled (deleted) ones.",
+            "Total number of timelines loaded in-memory",
        )
        .unwrap();
        descs.extend(timelines_count.desc().into_iter().cloned());

-        let timelines_cancelled_count = IntGauge::new(
-            "safekeeper_timelines_cancelled",
-            "Number of cancelled timelines loaded in-memory",
-        )
-        .unwrap();
-        descs.extend(timelines_cancelled_count.desc().into_iter().cloned());
-
        let active_timelines_count = IntGauge::new(
            "safekeeper_active_timelines",
            "Total number of active timelines",
@@ -601,13 +585,6 @@ impl TimelineCollector {
        .unwrap();
        descs.extend(active_timelines_count.desc().into_iter().cloned());

-        let active_timelines_set_size = IntGauge::new(
-            "safekeeper_active_timelines_set_size",
-            "Size of the active timelines hashset. Should match safekeeper_active_timelines metric.",
-        )
-        .unwrap();
-        descs.extend(active_timelines_set_size.desc().into_iter().cloned());
-
        TimelineCollector {
            descs,
            commit_lsn,
@@ -629,9 +606,7 @@ impl TimelineCollector {
            flushed_wal_seconds,
            collect_timeline_metrics,
            timelines_count,
-            timelines_cancelled_count,
            active_timelines_count,
-            active_timelines_set_size,
        }
    }
 }
@@ -784,19 +759,10 @@ impl Collector for TimelineCollector {
        self.timelines_count.set(timelines_count as i64);
        mfs.extend(self.timelines_count.collect());

-        // report number of cancelled timelines
-        self.timelines_cancelled_count
-            .set(GlobalTimelines::get_num_cancelled() as i64);
-        mfs.extend(self.timelines_cancelled_count.collect());
-
        self.active_timelines_count
            .set(active_timelines_count as i64);
        mfs.extend(self.active_timelines_count.collect());

-        self.active_timelines_set_size
-            .set(GlobalTimelines::get_global_broker_active_set().get_len() as i64);
-        mfs.extend(self.active_timelines_set_size.collect());
-
        mfs
    }
 }
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -17,7 +17,7 @@ use utils::{
 use crate::{
    control_file, debug_dump,
    http::routes::TimelineStatus,
-    timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError},
+    timeline::{Timeline, TimelineError},
    wal_storage::{self, Storage},
    GlobalTimelines, SafeKeeperConf,
 };
@@ -283,13 +283,13 @@ pub async fn load_temp_timeline(
    }

    // Move timeline dir to the correct location
-    let timeline_path = get_timeline_dir(conf, &ttid);
+    let timeline_path = conf.timeline_dir(&ttid);

    info!(
        "moving timeline {} from {} to {}",
        ttid, tmp_path, timeline_path
    );
-    tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?;
+    tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
    tokio::fs::rename(tmp_path, &timeline_path).await?;

    let tli = GlobalTimelines::load_timeline(&guard, ttid)
--- a/Show More
+++ b/Show More