feat(pageserver): support multiple key ranges for image initial flush path

Signed-off-by: Alex Chi Z <chi@neon.tech>
2026-02-23 04:20:37 +00:00 · 2024-05-23 11:51:14 -04:00
241 changed files with 4048 additions and 11163 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -8,7 +8,6 @@
 !scripts/combine_control_files.py
 !scripts/ninstall.sh
 !vm-cgconfig.conf
-!docker-compose/run-tests.sh

 # Directories
 !.cargo/
@@ -21,7 +20,7 @@
 !patches/
 !pgxn/
 !proxy/
-!storage_scrubber/
+!s3_scrubber/
 !safekeeper/
 !storage_broker/
 !storage_controller/
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -24,7 +24,7 @@ jobs:

  actionlint:
    needs: [ check-permissions ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: reviewdog/action-actionlint@v1
@@ -36,15 +36,3 @@ jobs:
          fail_on_error: true
          filter_mode: nofilter
          level: error
-      - run: |
-          PAT='^\s*runs-on:.*-latest'
-          if grep -ERq $PAT .github/workflows
-          then
-            grep -ERl $PAT .github/workflows |\
-            while read -r f
-            do
-              l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
-              echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
-            done
-            exit 1
-          fi
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -44,7 +44,7 @@ jobs:
      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -60,7 +60,7 @@ jobs:
      github.event.action == 'labeled' &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -69,41 +69,15 @@ jobs:
        with:
          ref: main
          token: ${{ secrets.CI_ACCESS_TOKEN }}
-      
-      - name: Look for existing PR
-        id: get-pr
-        env:
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        run: |
-          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
-          echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT}
-      
-      - name: Get changed labels
-        id: get-labels
-        if: steps.get-pr.outputs.ALREADY_CREATED != ''
-        env:
-          ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        run: |
-          LABELS_TO_REMOVE=$(comm -23 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) | sort) \
-          <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\
-          ( grep -v run-e2e-tests-in-draft || true ) | paste -sd , -)
-          LABELS_TO_ADD=$(comm -13 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) |sort) \
-          <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' |  ( grep -E '^run' || true ) | sort ) |\
-          paste -sd , -)
-          echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
-          echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}

      - run: gh pr checkout "${PR_NUMBER}"

      - run: git checkout -b "${BRANCH}"

      - run: git push --force origin "${BRANCH}"
-        if: steps.get-pr.outputs.ALREADY_CREATED == ''

      - name: Create a Pull Request for CI run (if required)
-        if: steps.get-pr.outputs.ALREADY_CREATED == ''
-        env: 
+        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          cat << EOF > body.md
@@ -114,33 +88,16 @@ jobs:
            Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
          EOF

-          LABELS=$( (gh pr --repo "${GITHUB_REPOSITORY}" view ${PR_NUMBER}  --json labels --jq '.labels.[].name'; echo run-e2e-tests-in-draft  )| \
-          grep -E '^run' | paste -sd , -)
-          gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
+          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
+          if [ -z "${ALREADY_CREATED}" ]; then
+            gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
                                                       --body-file "body.md" \
                                                       --head "${BRANCH}" \
                                                       --base "main" \
-                                                       --label ${LABELS} \
+                                                       --label "run-e2e-tests-in-draft" \
                                                       --draft
-      - name: Modify the existing pull request (if required)
-        if: steps.get-pr.outputs.ALREADY_CREATED != ''
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          LABELS_TO_ADD: ${{ steps.get-labels.outputs.LABELS_TO_ADD }}
-          LABELS_TO_REMOVE: ${{ steps.get-labels.outputs.LABELS_TO_REMOVE }}
-          ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
-        run: |
-          ADD_CMD=
-          REMOVE_CMD=
-          [ -z "${LABELS_TO_ADD}" ] || ADD_CMD="--add-label ${LABELS_TO_ADD}"
-          [ -z "${LABELS_TO_REMOVE}" ] || REMOVE_CMD="--remove-label ${LABELS_TO_REMOVE}"
-          if [ -n "${ADD_CMD}" ] || [ -n "${REMOVE_CMD}" ]; then
-            gh pr --repo "${GITHUB_REPOSITORY}" edit ${ALREADY_CREATED} ${ADD_CMD} ${REMOVE_CMD}
          fi

-      - run: git push --force origin "${BRANCH}"
-        if: steps.get-pr.outputs.ALREADY_CREATED != ''
-             
  cleanup:
    # Close PRs and delete branchs if the original PR is closed.

@@ -152,7 +109,7 @@ jobs:
      github.event.action == 'closed' &&
      github.event.pull_request.head.repo.full_name != github.repository

-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -38,11 +38,6 @@ on:
        description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
        required: false
        default: false
-      run_only_pgvector_tests:
-        type: boolean
-        description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run'
-        required: false
-        default: false

 defaults:
  run:
@@ -55,7 +50,6 @@ concurrency:

 jobs:
  bench:
-    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -126,7 +120,6 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  generate-matrices:
-    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
    #
    # Available platforms:
@@ -137,7 +130,7 @@ jobs:
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
    env:
      RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    outputs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
      olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
@@ -204,7 +197,6 @@ jobs:
        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

  pgbench-compare:
-    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    needs: [ generate-matrices ]

    strategy:
@@ -351,92 +343,6 @@ jobs:
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

-  pgbench-pgvector:
-    env:
-      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
-      TEST_PG_BENCH_SCALES_MATRIX: "1"
-      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
-      TEST_OUTPUT: /tmp/test_output
-      BUILD_TYPE: remote
-      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: "neon-captest-pgvector"
-
-    runs-on: [ self-hosted, us-east-2, x64 ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
-      options: --init
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Download Neon artifact
-      uses: ./.github/actions/download
-      with:
-        name: neon-${{ runner.os }}-release-artifact
-        path: /tmp/neon/
-        prefix: latest
-
-    - name: Add Postgres binaries to PATH
-      run: |
-        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
-    - name: Set up Connection String
-      id: set-up-connstr
-      run: |
-        CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
-        
-        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-
-        QUERIES=("SELECT version()")
-        QUERIES+=("SHOW neon.tenant_id")
-        QUERIES+=("SHOW neon.timeline_id")
-        
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
-
-    - name: Benchmark pgvector hnsw indexing
-      uses: ./.github/actions/run-python-test-set
-      with:
-        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance/test_perf_olap.py
-        run_in_parallel: false
-        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
-      env:
-        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
-        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-
-    - name: Benchmark pgvector hnsw queries
-      uses: ./.github/actions/run-python-test-set
-      with:
-        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance
-        run_in_parallel: false
-        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
-      env:
-        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
-        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-    
-    - name: Create Allure report
-      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report-generate
-
-    - name: Post to a Slack channel
-      if: ${{ github.event.schedule && failure() }}
-      uses: slackapi/slack-github-action@v1
-      with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-      env:
-        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
-
  clickbench-compare:
    # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
    # we use for performance testing in pgbench-compare.
@@ -445,7 +351,7 @@ jobs:
    #
    # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
    # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
-    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    if: ${{ !cancelled() }}
    needs: [ generate-matrices, pgbench-compare ]

    strategy:
@@ -549,7 +455,7 @@ jobs:
    # We might change it after https://github.com/neondatabase/neon/issues/2900.
    #
    # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
-    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    if: ${{ !cancelled() }}
    needs: [ generate-matrices, clickbench-compare ]

    strategy:
@@ -651,7 +557,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  user-examples-compare:
-    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    if: ${{ !cancelled() }}
    needs: [ generate-matrices, tpch-compare ]

    strategy:
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -55,7 +55,7 @@ jobs:
            exit 1
          fi

-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3

      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
      # The default value is ~/.docker
@@ -88,7 +88,7 @@ jobs:

  merge-images:
    needs: [ build-image ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    env:
      IMAGE_TAG: ${{ inputs.image-tag }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -35,7 +35,7 @@ jobs:
  cancel-previous-e2e-tests:
    needs: [ check-permissions ]
    if: github.event_name == 'pull_request'
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - name: Cancel previous e2e-tests runs for this PR
@@ -549,7 +549,7 @@ jobs:
  report-benchmarks-failures:
    needs: [ benchmarks, create-test-report ]
    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
    - uses: slackapi/slack-github-action@v1
@@ -774,7 +774,7 @@ jobs:

  neon-image:
    needs: [ neon-image-arch, tag ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - uses: docker/login-action@v3
@@ -859,26 +859,6 @@ jobs:
          tags: |
            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

-      - name: Build neon extensions test image
-        if: matrix.version == 'v16'
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          build-args: |
-            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-            PG_VERSION=${{ matrix.version }}
-            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
-          provenance: false
-          push: true
-          pull: true
-          file: Dockerfile.compute-node
-          target: neon-pg-ext-test
-          cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
-          tags: |
-            neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
-
      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
        if: matrix.version == 'v16'
@@ -904,7 +884,7 @@ jobs:

  compute-node-image:
    needs: [ compute-node-image-arch, tag ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    strategy:
      matrix:
@@ -922,13 +902,6 @@ jobs:
                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64

-      - name: Create multi-arch neon-test-extensions image
-        if: matrix.version == 'v16'
-        run: |
-          docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
-
      - name: Create multi-arch compute-tools image
        if: matrix.version == 'v16'
        run: |
@@ -965,7 +938,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1
        with:
          fetch-depth: 0

@@ -1047,7 +1020,7 @@ jobs:
            exit 1
          fi

-      - name: Verify docker-compose example and test extensions
+      - name: Verify docker-compose example
        timeout-minutes: 20
        run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh

@@ -1059,7 +1032,7 @@ jobs:

  promote-images:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    env:
      VERSIONS: v14 v15 v16
@@ -1101,12 +1074,10 @@ jobs:
                                                 $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
            done
          done
-          docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
-                                             neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}

  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -19,7 +19,7 @@ permissions: {}

 jobs:
  check-image:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    outputs:
      tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
      found: ${{ steps.check-image.outputs.found }}
--- a/.github/workflows/check-permissions.yml
+++ b/.github/workflows/check-permissions.yml
@@ -16,7 +16,7 @@ permissions: {}

 jobs:
  check-permissions:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
    - name: Disallow CI runs on PRs from forks
      if: |
--- a/.github/workflows/cleanup-caches-by-a-branch.yml
+++ b/.github/workflows/cleanup-caches-by-a-branch.yml
@@ -9,7 +9,7 @@ on:

 jobs:
  cleanup:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    steps:
      - name: Cleanup
        run: |
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -20,7 +20,7 @@ concurrency:
 jobs:
  test-postgres-client-libs:
    # TODO: switch to gen2 runner, requires docker
-    runs-on: ubuntu-22.04
+    runs-on: [ ubuntu-latest ]

    env:
      DEFAULT_PG_VERSION: 14
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -26,7 +26,7 @@ permissions: {}

 jobs:
  tag-image:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    env:
      FROM_TAG: ${{ inputs.from-tag }}
--- a/.github/workflows/release-notify.yml
+++ b/.github/workflows/release-notify.yml
@@ -19,7 +19,7 @@ on:

 jobs:
  notify:
-    runs-on: ubuntu-22.04
+    runs-on: [ ubuntu-latest ]

    steps:
      - uses: neondatabase/dev-actions/release-pr-notify@main
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -26,7 +26,7 @@ defaults:
 jobs:
  create-storage-release-branch:
    if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    permissions:
      contents: write # for `git push`
@@ -53,7 +53,7 @@ jobs:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
        cat << EOF > body.md
-          ## Storage & Compute release ${RELEASE_DATE}
+          ## Release ${RELEASE_DATE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF
@@ -65,7 +65,7 @@ jobs:

  create-proxy-release-branch:
    if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    permissions:
      contents: write # for `git push`
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -19,7 +19,7 @@ env:
 jobs:
  cancel-previous-e2e-tests:
    if: github.event_name == 'pull_request'
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest

    steps:
      - name: Cancel previous e2e-tests runs for this PR
@@ -31,7 +31,7 @@ jobs:
              --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"

  tag:
-    runs-on: ubuntu-22.04
+    runs-on: [ ubuntu-latest ]
    outputs:
      build-tag: ${{ steps.build-tag.outputs.tag }}

@@ -62,7 +62,7 @@ jobs:

  trigger-e2e-tests:
    needs: [ tag ]
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
    env:
      TAG: ${{ needs.tag.outputs.build-tag }}
    steps:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -776,6 +776,7 @@ dependencies = [
 "pin-project",
 "serde",
 "time",
+ "tz-rs",
 "url",
 "uuid",
 ]
@@ -1290,6 +1291,12 @@ dependencies = [
 "tiny-keccak",
 ]

+[[package]]
+name = "const_fn"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935"
+
 [[package]]
 name = "const_format"
 version = "0.2.30"
@@ -1969,6 +1976,21 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"

+[[package]]
+name = "foreign-types"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
+dependencies = [
+ "foreign-types-shared",
+]
+
+[[package]]
+name = "foreign-types-shared"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
+
 [[package]]
 name = "form_urlencoded"
 version = "1.1.0"
@@ -2598,6 +2620,19 @@ dependencies = [
 "tokio-io-timeout",
 ]

+[[package]]
+name = "hyper-tls"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
+dependencies = [
+ "bytes",
+ "hyper 0.14.26",
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+]
+
 [[package]]
 name = "hyper-util"
 version = "0.1.3"
@@ -2915,12 +2950,6 @@ version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"

-[[package]]
-name = "linux-raw-sys"
-version = "0.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4"
-
 [[package]]
 name = "lock_api"
 version = "0.4.10"
@@ -3139,6 +3168,24 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"

+[[package]]
+name = "native-tls"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
+dependencies = [
+ "lazy_static",
+ "libc",
+ "log",
+ "openssl",
+ "openssl-probe",
+ "openssl-sys",
+ "schannel",
+ "security-framework",
+ "security-framework-sys",
+ "tempfile",
+]
+
 [[package]]
 name = "nix"
 version = "0.25.1"
@@ -3309,6 +3356,15 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "num_threads"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "oauth2"
 version = "4.4.2"
@@ -3358,12 +3414,50 @@ version = "11.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"

+[[package]]
+name = "openssl"
+version = "0.10.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800"
+dependencies = [
+ "bitflags 2.4.1",
+ "cfg-if",
+ "foreign-types",
+ "libc",
+ "once_cell",
+ "openssl-macros",
+ "openssl-sys",
+]
+
+[[package]]
+name = "openssl-macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.52",
+]
+
 [[package]]
 name = "openssl-probe"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"

+[[package]]
+name = "openssl-sys"
+version = "0.9.96"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+ "vcpkg",
+]
+
 [[package]]
 name = "opentelemetry"
 version = "0.20.0"
@@ -3570,7 +3664,6 @@ dependencies = [
 "serde",
 "serde_json",
 "svg_fmt",
- "thiserror",
 "tokio",
 "tokio-util",
 "toml_edit",
@@ -4012,6 +4105,17 @@ dependencies = [
 "tokio-postgres",
 ]

+[[package]]
+name = "postgres-native-tls"
+version = "0.5.0"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+dependencies = [
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+ "tokio-postgres",
+]
+
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
@@ -4120,7 +4224,6 @@ version = "0.1.0"
 dependencies = [
 "byteorder",
 "bytes",
- "itertools",
 "pin-project-lite",
 "postgres-protocol",
 "rand 0.8.5",
@@ -4310,7 +4413,6 @@ dependencies = [
 "http 1.1.0",
 "http-body-util",
 "humantime",
- "humantime-serde",
 "hyper 0.14.26",
 "hyper 1.2.0",
 "hyper-util",
@@ -4321,6 +4423,7 @@ dependencies = [
 "md5",
 "measured",
 "metrics",
+ "native-tls",
 "once_cell",
 "opentelemetry",
 "parking_lot 0.12.1",
@@ -4328,6 +4431,7 @@ dependencies = [
 "parquet_derive",
 "pbkdf2",
 "pin-project-lite",
+ "postgres-native-tls",
 "postgres-protocol",
 "postgres_backend",
 "pq_proto",
@@ -4346,7 +4450,6 @@ dependencies = [
 "rstest",
 "rustc-hash",
 "rustls 0.22.4",
- "rustls-native-certs 0.7.0",
 "rustls-pemfile 2.1.1",
 "scopeguard",
 "serde",
@@ -4376,6 +4479,7 @@ dependencies = [
 "utils",
 "uuid",
 "walkdir",
+ "webpki-roots 0.25.2",
 "workspace_hack",
 "x509-parser",
 ]
@@ -4682,21 +4786,20 @@ dependencies = [
 "http 0.2.9",
 "http-body 0.4.5",
 "hyper 0.14.26",
- "hyper-rustls 0.24.0",
+ "hyper-tls",
 "ipnet",
 "js-sys",
 "log",
 "mime",
+ "native-tls",
 "once_cell",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.21.11",
- "rustls-pemfile 1.0.2",
 "serde",
 "serde_json",
 "serde_urlencoded",
 "tokio",
- "tokio-rustls 0.24.0",
+ "tokio-native-tls",
 "tokio-util",
 "tower-service",
 "url",
@@ -4704,7 +4807,6 @@ dependencies = [
 "wasm-bindgen-futures",
 "wasm-streams 0.3.0",
 "web-sys",
- "webpki-roots 0.25.2",
 "winreg 0.50.0",
 ]

@@ -5109,6 +5211,51 @@ version = "1.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"

+[[package]]
+name = "s3_scrubber"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-stream",
+ "aws-config",
+ "aws-sdk-s3",
+ "aws-smithy-async",
+ "bincode",
+ "bytes",
+ "camino",
+ "chrono",
+ "clap",
+ "crc32c",
+ "either",
+ "futures",
+ "futures-util",
+ "hex",
+ "histogram",
+ "itertools",
+ "native-tls",
+ "pageserver",
+ "pageserver_api",
+ "postgres-native-tls",
+ "postgres_ffi",
+ "rand 0.8.5",
+ "remote_storage",
+ "reqwest 0.12.4",
+ "serde",
+ "serde_json",
+ "serde_with",
+ "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-rustls 0.25.0",
+ "tokio-stream",
+ "tokio-util",
+ "tracing",
+ "tracing-appender",
+ "tracing-subscriber",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "safekeeper"
 version = "0.1.0"
@@ -5765,54 +5912,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "storage_scrubber"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "async-stream",
- "aws-config",
- "aws-sdk-s3",
- "aws-smithy-async",
- "bincode",
- "bytes",
- "camino",
- "chrono",
- "clap",
- "crc32c",
- "either",
- "futures",
- "futures-util",
- "hex",
- "histogram",
- "humantime",
- "itertools",
- "once_cell",
- "pageserver",
- "pageserver_api",
- "postgres_ffi",
- "rand 0.8.5",
- "remote_storage",
- "reqwest 0.12.4",
- "rustls 0.22.4",
- "rustls-native-certs 0.7.0",
- "serde",
- "serde_json",
- "serde_with",
- "thiserror",
- "tokio",
- "tokio-postgres",
- "tokio-postgres-rustls",
- "tokio-rustls 0.25.0",
- "tokio-stream",
- "tokio-util",
- "tracing",
- "tracing-appender",
- "tracing-subscriber",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "storcon_cli"
 version = "0.1.0"
@@ -5820,8 +5919,6 @@ dependencies = [
 "anyhow",
 "clap",
 "comfy-table",
- "futures",
- "humantime",
 "hyper 0.14.26",
 "pageserver_api",
 "pageserver_client",
@@ -6092,6 +6189,8 @@ checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
 dependencies = [
 "itoa",
 "js-sys",
+ "libc",
+ "num_threads",
 "serde",
 "time-core",
 "time-macros",
@@ -6167,7 +6266,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
 dependencies = [
 "futures",
 "nix 0.26.4",
@@ -6201,6 +6300,16 @@ dependencies = [
 "syn 2.0.52",
 ]

+[[package]]
+name = "tokio-native-tls"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
+dependencies = [
+ "native-tls",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
@@ -6607,6 +6716,15 @@ version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"

+[[package]]
+name = "tz-rs"
+version = "0.6.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4"
+dependencies = [
+ "const_fn",
+]
+
 [[package]]
 name = "uname"
 version = "0.1.1"
@@ -6679,12 +6797,11 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
 dependencies = [
 "bytes",
 "io-uring",
 "libc",
- "linux-raw-sys 0.6.4",
 ]

 [[package]]
@@ -7512,9 +7629,9 @@ dependencies = [

 [[package]]
 name = "zeroize"
-version = "1.7.0"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
+checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
 dependencies = [
 "zeroize_derive",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ members = [
    "safekeeper",
    "storage_broker",
    "storage_controller",
-    "storage_scrubber",
+    "s3_scrubber",
    "workspace_hack",
    "trace",
    "libs/compute_api",
@@ -46,10 +46,10 @@ anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
-azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
-azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
-azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
-azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
+azure_core = "0.19"
+azure_identity = "0.19"
+azure_storage = "0.19"
+azure_storage_blobs = "0.19"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -114,13 +114,14 @@ md5 = "0.7.0"
 measured = { version = "0.0.21", features=["lasso"] }
 measured-process = { version = "0.0.21" }
 memoffset = "0.8"
+native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
 notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
 opentelemetry = "0.20.0"
-opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
 parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
@@ -128,7 +129,7 @@ parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.14"
-prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
+prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
@@ -184,13 +185,13 @@ tower-service = "0.3.2"
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.21.0"
-tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
-rustls-native-certs = "0.7"
+webpki-roots = "0.25"
 x509-parser = "0.15"

 ## TODO replace this with tracing
@@ -199,6 +200,7 @@ log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
@@ -239,7 +241,8 @@ tonic-build = "0.9"

 [patch.crates-io]

-# Needed to get `tokio-postgres-rustls` to depend on our fork.
+# This is only needed for proxy's tests.
+# TODO: we should probably fork `tokio-postgres-rustls` instead.
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 # bug fixes for UUID
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -141,7 +141,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.79.0
+ENV RUSTC_VERSION=1.78.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -89,7 +89,7 @@ RUN apt update && \
 # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
-    mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
+    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
    cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /
@@ -98,7 +98,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"

 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
-    mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
+    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
@@ -124,7 +124,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg

 RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
-    mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
+    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -149,7 +149,7 @@ RUN apt update && \

 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
    echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
-    mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
+    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
    # generate and copy upgrade scripts
    mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
    cp upgrade/* /usr/local/pgsql/share/extension/ && \
@@ -194,7 +194,7 @@ RUN case "$(uname -m)" in \

 RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
    echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
-    mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
+    mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
    mkdir build && cd build && \
    cmake .. -DCMAKE_BUILD_TYPE=Release && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -204,7 +204,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz

 RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
    echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
-    mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
+    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -222,7 +222,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
    echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
-    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
+    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    # unit extension's "create extension" script relies on absolute install path to fill some reference tables.
@@ -243,12 +243,12 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY patches/pgvector.patch /pgvector.patch

-# By default, pgvector Makefile uses `-march=native`. We don't want that,
+# By default, pgvector Makefile uses `-march=native`. We don't want that, 
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
-    echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
-    mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
+    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
+    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    patch -p1 < /pgvector.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -266,7 +266,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
 RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
    echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
-    mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
+    mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control

@@ -281,7 +281,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
    echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
-    mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
+    mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
@@ -297,7 +297,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
    echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
-    mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
+    mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
@@ -313,7 +313,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
    echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
-    mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
+    mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
@@ -329,7 +329,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
    echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
-    mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
+    mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
@@ -345,7 +345,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
-    mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
+    mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
@@ -361,7 +361,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
-    mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
+    mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
@@ -377,7 +377,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
-    mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
+    mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
@@ -393,7 +393,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
    echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
-    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
+    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
@@ -424,7 +424,7 @@ RUN case "${PG_VERSION}" in \
    apt-get install -y cmake && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
-    mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \
+    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -462,7 +462,7 @@ RUN case "${PG_VERSION}" in \
    esac && \
    wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \
    echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \
-    mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
+    mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
@@ -481,7 +481,7 @@ RUN apt-get update && \
    apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
-    mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
+    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
@@ -505,7 +505,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
-    mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
+    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
@@ -531,7 +531,7 @@ RUN apt-get update && \
 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
    echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
-    mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
+    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
    cmake \
        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
        -D RDK_BUILD_INCHI_SUPPORT=ON \
@@ -571,7 +571,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
-    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
+    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
@@ -588,7 +588,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
-    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
+    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
@@ -605,7 +605,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
    echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
-    mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
+    mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
@@ -631,7 +631,7 @@ RUN case "${PG_VERSION}" in \
    esac && \
    wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
    echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
-    mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
+    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install

@@ -647,7 +647,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
-    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
+    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
@@ -696,7 +696,7 @@ ARG PG_VERSION

 RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
    echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
-    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
+    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
@@ -713,7 +713,7 @@ ARG PG_VERSION

 RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
    echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
-    mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
+    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release && \
    # it's needed to enable extension because it uses untrusted C language
@@ -733,7 +733,7 @@ ARG PG_VERSION
 # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
 RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
-    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
+    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

@@ -749,7 +749,7 @@ ARG PG_VERSION

 RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
    echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
-    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
+    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
    echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
    wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
    patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
@@ -771,7 +771,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
    echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
-    mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
+    mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install

@@ -787,7 +787,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
    echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
-    mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
+    mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
@@ -804,7 +804,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
    echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
-    mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
+    mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
@@ -928,69 +928,6 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-
-#########################################################################################
-#
-# Layer neon-pg-ext-test
-#
-#########################################################################################
-
-FROM neon-pg-ext-build AS neon-pg-ext-test
-ARG PG_VERSION
-RUN mkdir /ext-src
-
-#COPY --from=postgis-build /postgis.tar.gz /ext-src/
-#COPY --from=postgis-build /sfcgal/* /usr
-COPY --from=plv8-build /plv8.tar.gz /ext-src/
-COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
-COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
-COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
-COPY --from=vector-pg-build /pgvector.patch /ext-src/
-COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
-#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
-#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
-#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
-COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
-COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
-#COPY --from=rum-pg-build /rum.tar.gz /ext-src
-#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
-COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
-COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
-COPY --from=hll-pg-build /hll.tar.gz /ext-src
-COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
-#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
-COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
-COPY patches/pg_hintplan.patch /ext-src
-#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
-COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
-COPY patches/pg_cron.patch /ext-src
-#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
-COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
-COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
-COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
-COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
-#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
-#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
-COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
-COPY patches/pg_anon.patch /ext-src
-COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
-COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
-RUN cd /ext-src/ && for f in *.tar.gz; \
-    do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
-    rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
-    || exit 1; rm -f $f; done
-RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
-# cmake is required for the h3 test
-RUN apt-get update && apt-get install -y cmake
-RUN patch -p1 < /ext-src/pg_hintplan.patch
-COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
-RUN patch -p1 </ext-src/pg_anon.patch
-RUN patch -p1 </ext-src/pg_cron.patch
-ENV PATH=/usr/local/pgsql/bin:$PATH
-ENV PGHOST=compute
-ENV PGPORT=55433
-ENV PGUSER=cloud_admin
-ENV PGDATABASE=postgres
 #########################################################################################
 #
 # Final layer
--- a/Makefile
+++ b/Makefile
@@ -124,8 +124,6 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
 	+@echo "Compiling amcheck $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
-	+@echo "Compiling test_decoding $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install

 .PHONY: postgres-clean-%
 postgres-clean-%:
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -735,7 +735,7 @@ fn cli() -> clap::Command {
            Arg::new("filecache-connstr")
                .long("filecache-connstr")
                .default_value(
-                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor",
+                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
                )
                .value_name("FILECACHE_CONNSTR"),
        )
--- a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql
+++ b/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql
@@ -1 +0,0 @@
-ALTER ROLE neon_superuser BYPASSRLS;
--- a/compute_tools/src/migrations/0001-alter_roles.sql
+++ b/compute_tools/src/migrations/0001-alter_roles.sql
@@ -1,18 +0,0 @@
-DO $$
-DECLARE
-    role_name text;
-BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
-    END LOOP;
-
-    FOR role_name IN SELECT rolname FROM pg_roles
-        WHERE
-            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
-    END LOOP;
-END $$;
--- a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql
@@ -1,6 +0,0 @@
-DO $$
-BEGIN
-    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
-        EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
-    END IF;
-END $$;
--- a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql
@@ -1 +0,0 @@
-GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;
--- a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql
@@ -1,4 +0,0 @@
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
--       interacted with by neon_superuser without permission issues.
-
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser;
--- a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql
@@ -1,4 +0,0 @@
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
--       interacted with by neon_superuser without permission issues.
-
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser;
--- a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
@@ -1,3 +0,0 @@
-- SKIP: Moved inline to the handle_grants() functions.
-
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
--- a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
@@ -1,3 +0,0 @@
-- SKIP: Moved inline to the handle_grants() functions.
-
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
--- a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql
+++ b/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql
@@ -1,13 +0,0 @@
-- SKIP: The original goal of this migration was to prevent creating
--       subscriptions, but this migration was insufficient.
-
-DO $$
-DECLARE
-    role_name TEXT;
-BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
-    END LOOP;
-END $$;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -774,21 +774,44 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
    // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

-    // Add new migrations in numerical order.
    let migrations = [
-        include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
-        include_str!("./migrations/0001-alter_roles.sql"),
-        include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
-        include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
-        include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
-        include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
-        include_str!(
-            "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
-        ),
-        include_str!(
-            "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
-        ),
-        include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
+        "ALTER ROLE neon_superuser BYPASSRLS",
+        r#"
+DO $$
+DECLARE
+    role_name text;
+BEGIN
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
+    END LOOP;
+
+    FOR role_name IN SELECT rolname FROM pg_roles
+        WHERE
+            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
+    END LOOP;
+END $$;
+"#,
+        r#"
+DO $$
+BEGIN
+    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
+        EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
+    END IF;
+END
+$$;"#,
+        "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
+        // Don't remove: these empty placeholders stand for SQL that was originally applied as migrations but is now executed elsewhere; they keep the ids (array indices) of later migrations stable.
+        "",
+        "",
+        "",
+        "",
+        "",
+        // Add new migrations below.
    ];

    let mut func = || {
@@ -824,13 +847,10 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {

    while current_migration < migrations.len() {
        let migration = &migrations[current_migration];
-        if migration.starts_with("-- SKIP") {
-            info!("Skipping migration id={}", current_migration);
+        if migration.is_empty() {
+            info!("Skip migration id={}", current_migration);
        } else {
-            info!(
-                "Running migration id={}:\n{}\n",
-                current_migration, migration
-            );
+            info!("Running migration:\n{}\n", migration);
            client.simple_query(migration).with_context(|| {
                format!("handle_migrations current_migration={}", current_migration)
            })?;
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -9,8 +9,6 @@ license.workspace = true
 anyhow.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
-futures.workspace = true
-humantime.workspace = true
 hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,4 +1,3 @@
-use futures::StreamExt;
 use std::{collections::HashMap, str::FromStr, time::Duration};

 use clap::{Parser, Subcommand};
@@ -8,9 +7,8 @@ use pageserver_api::{
        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
-        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
-        TenantShardSplitRequest, TenantShardSplitResponse,
+        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
+        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
@@ -127,44 +125,6 @@ enum Command {
        #[arg(long)]
        tenant_id: TenantId,
    },
-    /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
-    /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
-    TenantDrop {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        unclean: bool,
-    },
-    NodeDrop {
-        #[arg(long)]
-        node_id: NodeId,
-        #[arg(long)]
-        unclean: bool,
-    },
-    TenantSetTimeBasedEviction {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        period: humantime::Duration,
-        #[arg(long)]
-        threshold: humantime::Duration,
-    },
-    // Drain a set of specified pageservers by moving the primary attachments to pageservers
-    // outside of the specified set.
-    Drain {
-        // Set of pageserver node ids to drain.
-        #[arg(long)]
-        nodes: Vec<NodeId>,
-        // Optional: migration concurrency (default is 8)
-        #[arg(long)]
-        concurrency: Option<usize>,
-        // Optional: maximum number of shards to migrate
-        #[arg(long)]
-        max_shards: Option<usize>,
-        // Optional: when set to true, nothing is migrated, but the plan is printed to stdout
-        #[arg(long)]
-        dry_run: Option<bool>,
-    },
 }

 #[derive(Parser)]
@@ -714,234 +674,6 @@ async fn main() -> anyhow::Result<()> {
                }
            }
        }
-        Command::TenantDrop { tenant_id, unclean } => {
-            if !unclean {
-                anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant.  If you know what you're doing, add `--unclean` to proceed.")
-            }
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::POST,
-                    format!("debug/v1/tenant/{tenant_id}/drop"),
-                    None,
-                )
-                .await?;
-        }
-        Command::NodeDrop { node_id, unclean } => {
-            if !unclean {
-                anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it.  If you know what you're doing, add `--unclean` to proceed.")
-            }
-            storcon_client
-                .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
-                .await?;
-        }
-        Command::TenantSetTimeBasedEviction {
-            tenant_id,
-            period,
-            threshold,
-        } => {
-            vps_client
-                .tenant_config(&TenantConfigRequest {
-                    tenant_id,
-                    config: TenantConfig {
-                        eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
-                            EvictionPolicyLayerAccessThreshold {
-                                period: period.into(),
-                                threshold: threshold.into(),
-                            },
-                        )),
-                        ..Default::default()
-                    },
-                })
-                .await?;
-        }
-        Command::Drain {
-            nodes,
-            concurrency,
-            max_shards,
-            dry_run,
-        } => {
-            // Load the list of nodes, split them up into the drained and filled sets,
-            // and validate that draining is possible.
-            let node_descs = storcon_client
-                .dispatch::<(), Vec<NodeDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/node".to_string(),
-                    None,
-                )
-                .await?;
-
-            let mut node_to_drain_descs = Vec::new();
-            let mut node_to_fill_descs = Vec::new();
-
-            for desc in node_descs {
-                let to_drain = nodes.iter().any(|id| *id == desc.id);
-                if to_drain {
-                    node_to_drain_descs.push(desc);
-                } else {
-                    node_to_fill_descs.push(desc);
-                }
-            }
-
-            if nodes.len() != node_to_drain_descs.len() {
-                anyhow::bail!("Drain requested for node which doesn't exist.")
-            }
-
-            node_to_fill_descs.retain(|desc| {
-                matches!(desc.availability, NodeAvailabilityWrapper::Active)
-                    && matches!(
-                        desc.scheduling,
-                        NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling
-                    )
-            });
-
-            if node_to_fill_descs.is_empty() {
-                anyhow::bail!("There are no nodes to drain to")
-            }
-
-            // Set the node scheduling policy to draining for the nodes which
-            // we plan to drain.
-            for node_desc in node_to_drain_descs.iter() {
-                let req = NodeConfigureRequest {
-                    node_id: node_desc.id,
-                    availability: None,
-                    scheduling: Some(NodeSchedulingPolicy::Draining),
-                };
-
-                storcon_client
-                    .dispatch::<_, ()>(
-                        Method::PUT,
-                        format!("control/v1/node/{}/config", node_desc.id),
-                        Some(req),
-                    )
-                    .await?;
-            }
-
-            // Perform the drain: move each tenant shard scheduled on a node to
-            // be drained to a node which is being filled. A simple round robin
-            // strategy is used to pick the new node.
-            let tenants = storcon_client
-                .dispatch::<(), Vec<TenantDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/tenant".to_string(),
-                    None,
-                )
-                .await?;
-
-            let mut selected_node_idx = 0;
-
-            struct DrainMove {
-                tenant_shard_id: TenantShardId,
-                from: NodeId,
-                to: NodeId,
-            }
-
-            let mut moves: Vec<DrainMove> = Vec::new();
-
-            let shards = tenants
-                .into_iter()
-                .flat_map(|tenant| tenant.shards.into_iter());
-            for shard in shards {
-                if let Some(max_shards) = max_shards {
-                    if moves.len() >= max_shards {
-                        println!(
-                            "Stop planning shard moves since the requested maximum was reached"
-                        );
-                        break;
-                    }
-                }
-
-                let should_migrate = {
-                    if let Some(attached_to) = shard.node_attached {
-                        node_to_drain_descs
-                            .iter()
-                            .map(|desc| desc.id)
-                            .any(|id| id == attached_to)
-                    } else {
-                        false
-                    }
-                };
-
-                if !should_migrate {
-                    continue;
-                }
-
-                moves.push(DrainMove {
-                    tenant_shard_id: shard.tenant_shard_id,
-                    from: shard
-                        .node_attached
-                        .expect("We only migrate attached tenant shards"),
-                    to: node_to_fill_descs[selected_node_idx].id,
-                });
-                selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len();
-            }
-
-            let total_moves = moves.len();
-
-            if dry_run == Some(true) {
-                println!("Dryrun requested. Planned {total_moves} moves:");
-                for mv in &moves {
-                    println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to)
-                }
-
-                return Ok(());
-            }
-
-            const DEFAULT_MIGRATE_CONCURRENCY: usize = 8;
-            let mut stream = futures::stream::iter(moves)
-                .map(|mv| {
-                    let client = Client::new(cli.api.clone(), cli.jwt.clone());
-                    async move {
-                        client
-                            .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
-                                Method::PUT,
-                                format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
-                                Some(TenantShardMigrateRequest {
-                                    tenant_shard_id: mv.tenant_shard_id,
-                                    node_id: mv.to,
-                                }),
-                            )
-                            .await
-                            .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
-                    }
-                })
-                .buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY));
-
-            let mut success = 0;
-            let mut failure = 0;
-
-            while let Some(res) = stream.next().await {
-                match res {
-                    Ok(_) => {
-                        success += 1;
-                    }
-                    Err((tenant_shard_id, from, to, error)) => {
-                        failure += 1;
-                        println!(
-                            "Failed to migrate {} from node {} to node {}: {}",
-                            tenant_shard_id, from, to, error
-                        );
-                    }
-                }
-
-                if (success + failure) % 20 == 0 {
-                    println!(
-                        "Processed {}/{} shards: {} succeeded, {} failed",
-                        success + failure,
-                        total_moves,
-                        success,
-                        failure
-                    );
-                }
-            }
-
-            println!(
-                "Processed {}/{} shards: {} succeeded, {} failed",
-                success + failure,
-                total_moves,
-                success,
-                failure
-            );
-        }
    }

    Ok(())
--- a/deny.toml
+++ b/deny.toml
@@ -99,13 +99,6 @@ name = "async-executor"
 [[bans.deny]]
 name = "smol"

-[[bans.deny]]
-# We want to use rustls instead of the platform's native tls implementation.
-name = "native-tls"
-
-[[bans.deny]]
-name = "openssl"
-
 # This section is considered when running `cargo deny check sources`.
 # More documentation about the 'sources' section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -8,11 +8,6 @@ USER root
 RUN apt-get update &&       \
    apt-get install -y curl \
                       jq   \
-                       python3-pip \
                       netcat
-#Faker is required for the pg_anon test
-RUN pip3 install Faker
-#This is required for the pg_hintplan test
-RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src 

-USER postgres
+USER postgres
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -95,7 +95,7 @@
            },
            {
                "name": "shared_preload_libraries",
-                "value": "neon,pg_cron,timescaledb,pg_stat_statements",
+                "value": "neon",
                "vartype": "string"
            },
            {
@@ -127,16 +127,6 @@
                "name": "max_replication_flush_lag",
                "value": "10GB",
                "vartype": "string"
-            },
-            {
-                "name": "cron.database",
-                "value": "postgres",
-                "vartype": "string"
-            },
-            {
-                "name": "session_preload_libraries",
-                "value": "anon",
-                "vartype": "string"
            }
        ]
    },
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -1,3 +1,5 @@
+version: '3'
+
 services:
  minio:
    restart: always
@@ -159,12 +161,12 @@ services:
      context: ./compute_wrapper/
      args:
        - REPOSITORY=${REPOSITORY:-neondatabase}
-        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16}
+        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
        - TAG=${TAG:-latest}
        - http_proxy=$http_proxy
        - https_proxy=$https_proxy
    environment:
-      - PG_VERSION=${PG_VERSION:-16}
+      - PG_VERSION=${PG_VERSION:-14}
      #- RUST_BACKTRACE=1
    # Mount the test files directly, for faster editing cycle.
    volumes:
@@ -192,14 +194,3 @@ services:
         done"
    depends_on:
      - compute
-
-  neon-test-extensions:
-    profiles: ["test-extensions"]
-    image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
-    entrypoint:
-      - "/bin/bash"
-      - "-c"
-    command:
-      - sleep 1800
-    depends_on:
-      - compute
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -7,94 +7,52 @@
 # Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
 # Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
 # to verify custom image builds (e.g pre-published ones).
-#
-# A test script for postgres extensions
-# Currently supports only v16
-#
+
 set -eux -o pipefail

-COMPOSE_FILE='docker-compose.yml'
-cd $(dirname $0)
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
+
 COMPUTE_CONTAINER_NAME=docker-compose-compute-1
-TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
-PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
-: ${http_proxy:=}
-: ${https_proxy:=}
-export http_proxy https_proxy
+SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
+PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"

 cleanup() {
    echo "show container information"
    docker ps
-    docker compose --profile test-extensions -f $COMPOSE_FILE logs
+    docker compose -f $COMPOSE_FILE logs
    echo "stop containers..."
-    docker compose --profile test-extensions -f $COMPOSE_FILE down
+    docker compose -f $COMPOSE_FILE down
 }

+echo "clean up containers if exists"
+cleanup
+
 for pg_version in 14 15 16; do
-    echo "clean up containers if exists"
-    cleanup
-    PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version))
-    PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
+    echo "start containers (pg_version=$pg_version)."
+    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d

    echo "wait until the compute is ready. timeout after 60s. "
    cnt=0
-    while sleep 3; do
+    while sleep 1; do
        # check timeout
-        cnt=`expr $cnt + 3`
+        cnt=`expr $cnt + 1`
        if [ $cnt -gt 60 ]; then
            echo "timeout before the compute is ready."
            cleanup
            exit 1
        fi
-        if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
+
+        # check if the compute is ready
+        set +o pipefail
+        result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
+        set -o pipefail
+        if [ $result -eq 1 ]; then
            echo "OK. The compute is ready to connect."
            echo "execute simple queries."
            docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
+            cleanup
            break
        fi
    done
-
-    if [ $pg_version -ge 16 ]
-    then
-        echo Enabling trust connection
-        docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
-        echo Adding postgres role
-        docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN"
-        # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
-        # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
-        echo Adding dummy config
-        docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
-        # This block is required for the pg_anon extension test.
-        # The test assumes that it is running on the same host with the postgres engine.
-        # In our case it's not true, that's why we are copying files to the compute node
-        TMPDIR=$(mktemp -d)
-        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
-        echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
-        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
-        rm -rf $TMPDIR
-        TMPDIR=$(mktemp -d)
-        # The following block does the same for the pg_hintplan test
-        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
-        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
-        rm -rf $TMPDIR
-        # We are running tests now
-        if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
-            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
-        then
-            cleanup
-        else
-            FAILED=$(tail -1 testout.txt)
-            for d in $FAILED
-            do
-                mkdir $d
-                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true
-                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true
-                cat $d/regression.out $d/regression.diffs || true
-            done
-        rm -rf $FAILED
-        cleanup
-        exit 1
-        fi
-    fi
-    cleanup
 done
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,15 +0,0 @@
-#!/bin/bash
-set -x
-
-cd /ext-src
-FAILED=
-LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
-for d in ${LIST}
-do
-       [ -d ${d} ] || continue
-    psql -c "select 1" >/dev/null || break
-       make -C ${d} installcheck || FAILED="${d} ${FAILED}"
-done
-[ -z "${FAILED}" ] && exit 0
-echo ${FAILED}
-exit 1
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -4,18 +4,18 @@

 Currently we build two main images:

- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14.
+- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
+- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).

 And additional intermediate image:

 - [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.

-## Build pipeline
+## Building pipeline

 We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs

-1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
+1. `neondatabase/compute-tools` and `neondatabase/compute-node`

 2. `neondatabase/neon`

@@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea
 1. create containers

 You can specify version of neon cluster using following environment values.
- PG_VERSION: postgres version for compute (default is 16 as of this writing)
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags), which is tagged in [CI test](/.github/workflows/build_and_test.yml). Default is 'latest'
+- PG_VERSION: postgres version for compute (default is 14)
+- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
 ```
 $ cd docker-compose/
 $ docker-compose down   # remove the containers if exists
-$ PG_VERSION=16 TAG=latest docker-compose up --build -d  # You can specify the postgres and image version
+$ PG_VERSION=15 TAG=2937 docker-compose up --build -d  # You can specify the postgres and image version
 Creating network "dockercompose_default" with the default driver
 Creating docker-compose_storage_broker_1       ... done
 (...omit...)
@@ -47,31 +47,29 @@ Creating docker-compose_storage_broker_1       ... done

 2. connect compute node
 ```
-$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres
-psql (16.3)
-Type "help" for help.
-
+$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
+$ chmod 600 ~/.pgpass
+$ psql -h localhost -p 55433 -U cloud_admin
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
-postgres=# insert into t values(1, 1);
+postgres=# insert into t values(1,1);
 INSERT 0 1
 postgres=# select * from t;
- key | value 
+ key | value
 -----+-------
   1 | 1
 (1 row)
-
 ```

 3. If you want to see the log, you can use `docker-compose logs` command.
 ```
 # check the container name you want to see
 $ docker ps
-CONTAINER ID   IMAGE                                              COMMAND                  CREATED         STATUS         PORTS                                                                                      NAMES
-3582f6d76227   docker-compose_compute                             "/shell/compute.sh"      2 minutes ago   Up 2 minutes   0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp   docker-compose_compute_1
+CONTAINER ID   IMAGE                                              COMMAND                  CREATED         STATUS         PORTS                                                                                                                                  NAMES
+d6968a5ae912   dockercompose_compute                              "/shell/compute.sh"      5 minutes ago   Up 5 minutes   0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp                                                                                       dockercompose_compute_1
 (...omit...)

-$ docker logs -f docker-compose_compute_1
+$ docker logs -f dockercompose_compute_1
 2022-10-21 06:15:48.757 GMT [56] LOG:  connection authorized: user=cloud_admin database=postgres application_name=psql
 2022-10-21 06:17:00.307 GMT [56] LOG:  [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
 (...omit...)
--- a/docs/rfcs/033-pageserver-postgres-bundle.md
+++ b/docs/rfcs/033-pageserver-postgres-bundle.md
@@ -1,139 +0,0 @@
-# Postgres Bundle for Pageserver
-
-Created on 2024-06-17
-
-## Summary
-
-This RFC defines the responsibilities of Compute and Storage team regarding the
-build & deployment of the Postgres code that Pageserver must run
-(`initdb`, `postgres --wal-redo`).
-
-## Motivation
-
-Pageserver has to run Postgres binaries to do its job, specifically
-
-* `initdb`
-* `postgres --wal-redo` mode
-
-Currently there is **no clear ownership** of
-* how these binaries are built
-  * including, critically, dynamic linkage against other libraries such as `libicu`
-* what build of the binaries ends up running on Pageservers
-* how the binaries and runtime dependencies (e.g., shared libraries) are delivered to Pageservers
-
-Further, these binaries have dependencies (e.g., libicu) which
-1. prevent the Storage team from switching Pageserver distro and/or version, and
-2. some dependencies impact compatibility between Storage and Compute (e.g., [libicu version impacts collation incompatibilty](https://github.com/neondatabase/neon/pull/8074))
-3. some dependencies can cause database corruption if updated carelessly (locale => libc)
-
-## Why Is This Worth Solving
-
-1. Clearly defined ownership generally boosts execution speed & bug triage.
-   * Example for why execution speed matters: CVE in dependency => who takes care of patching & updating.
-2. Centralize understanding of risks involved with some dependencies.
-   Currently, there is no team clearly responsible for assessing / tracking the risks. As a reminder from previous section, these are
-   * runtime incompatibilities
-   * database corruption
-
-Also, it is an unlock for additional future value, see "Future Work" section.
-
-## Impacted components (e.g. pageserver, safekeeper, console, etc)
-
-Pageserver (neon.git)
-Compute (neon.git)
-Deployment process (aws.git)
-
-## Design
-
-The basic interface between Compute and Storage team is as follows:
-
-* Compute team publishes a "bundle" of the binaries required by Pageserver
-* Storage team uses a pinned bundle in the Pageserver build process
-* Storage team code review is required to update the pinned version
-
-The "bundle" provides an interface agreed upon by Compute and Storage teams to run
-* for each supported Postgres version at Neon (v14, v15, v16, ...)
-  * the `initdb` process
-    * behaving like a vanilla Postgres `initdb`
-  * `postgres --wal-redo` mode process
-    * following the walredo protocol specified elsewhere
-
-The bundle is self-contained, i.e., it behaves the same way on any Linux system.
-The only ambient runtime dependency is the Linux kernel.
-The minimum Linux kernel version is 5.10.
-
-### Variant 1: bundle = fully statically linked binaries
-The "bundle" is a tarball of fully statically linked binaries
-
-```
-v14/initdb
-v14/postgres
-v15/initdb
-v15/postgres
-v16/initdb
-v16/postgres
-...
-```
-
-The directory structure is part of the interface.
-
-### Variant 2: bundle = chrooted directory
-
-The "bundle" is a tarball that contains all sorts of files, plus a launcher script.
-
-```
-LAUNCHER
-storage
-storage/does
-storage/does/not
-storage/does/not/care
-```
-
-To launch `initdb` or `postgres --wal-redo`, the Pageserver does
-1. fork child process
-2. `chroot` into the extracted directory
-3. inside the chroot, run `/LAUNCHER VERSION PG_BINARY [FLAGS...]`
-4. The `LAUNCHER` script sets up library search paths, etc, and then `exec`s the correct binary
-
-We acknowledge this is half-way reinventing OCI + linux containers.
-However, our needs are much simpler than what OCI & Docker provide.
-Specifically, we do not want Pageserver to be runtime-dependent on e.g. Docker as the launcher.
-
-The `chroot` is to enforce that the "bundle" be self-contained.
-The special path `/inout` int he bundle is reserved, e.g., for `initdb` output.
-
-### Variant 3: ???
-
-Your design here, feedback welcome.
-
-## Security implications
-
-It's an improvement because a single team (Compute) will be responsible for runtime dependencies.
-
-## Implementation & Rollout
-
-Storage and Compute teams agree on a bundle definition.
-
-Compute team changes their build process to produce both
-1. existing: compute image / vm compute image
-2. existing: pg_install tarball (currently built by `neon.git:Dockerfile`)
-2. new: the bundle
-
-Storage makes `neon.git` Pageserver changes to support using bundle (behind feature flag).
-With feature flag disabled, existing `pg_install` tarball is used instead.
-
-Storage & infra make `aws.git` changes to deploy bundle to pageservers, with feature flag disabled.
-
-Storage team does gradual rollout.
-
-Storage & infra teams remove support for `pg_install`, delete it from the nodes (experimentation in staging to ensure no hidden runtime deps!)
-
-Compute team stops producing `pg_install` tarball.
-
-
-## Future Work
-
-We know that we can easily make pageserver fully statically linked.
-Together with the self-contained "bundle" proposed above, Pageserver can then be deployed to different OSes.
-For example, we have been entertaining the idea of trying Amazon Linux instead of Debian for Pageserver.
-That experiment would be a lot simpler.
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,7 +1,7 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi::RepOriginId;
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::{fmt, ops::Range};
@@ -39,9 +39,6 @@ pub const RELATION_SIZE_PREFIX: u8 = 0x61;
 /// The key prefix of AUX file keys.
 pub const AUX_KEY_PREFIX: u8 = 0x62;

-/// The key prefix of ReplOrigin keys.
-pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;
-
 /// Check if the key falls in the range of metadata keys.
 pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
    key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
@@ -56,8 +53,14 @@ impl Key {
    /// Encode a metadata key to a storage key.
    pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
        assert!(is_metadata_key_slice(key), "key not in metadata key range");
-        // Metadata key space ends at 0x7F so it's fine to directly convert it to i128.
-        Self::from_i128(i128::from_be_bytes(*key))
+        Key {
+            field1: key[0],
+            field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
+            field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
+            field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
+            field5: key[11],
+            field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
+        }
    }

    /// Encode a metadata key to a storage key.
@@ -65,6 +68,17 @@ impl Key {
        Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
    }

+    /// Write this key to `writer` in the metadata key encoding; exactly 16 bytes are written.
+    pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
+        writer.put_u8(self.field1);
+        assert!(self.field2 <= 0xFFFF);
+        writer.put_u16(self.field2 as u16);
+        writer.put_u32(self.field3);
+        writer.put_u32(self.field4);
+        writer.put_u8(self.field5);
+        writer.put_u32(self.field6);
+    }
+
    /// Get the range of metadata keys.
    pub const fn metadata_key_range() -> Range<Self> {
        Key {
@@ -107,7 +121,7 @@ impl Key {
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
    pub fn to_i128(&self) -> i128 {
-        assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
+        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
        (((self.field1 & 0x7F) as i128) << 120)
            | (((self.field2 & 0xFFFF) as i128) << 104)
            | ((self.field3 as i128) << 72)
@@ -161,7 +175,7 @@ impl Key {
    }

    /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
-    /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
+    /// Use [`Key::from_metadata_key`] instead.
    pub fn from_slice(b: &[u8]) -> Self {
        Key {
            field1: b[0],
@@ -174,7 +188,7 @@ impl Key {
    }

    /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
-    /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
+    /// Use [`Key::extract_metadata_key_to_writer`] instead.
    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
        buf[0] = self.field1;
        BE::write_u32(&mut buf[1..5], self.field2);
@@ -385,14 +399,7 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
        field3: rel.dbnode,
        field4: rel.relnode,
        field5: rel.forknum,
-        field6: 0xffff_ffff,
-    }
-}
-
-impl Key {
-    #[inline(always)]
-    pub fn is_rel_size_key(&self) -> bool {
-        self.field1 == 0 && self.field6 == u32::MAX
+        field6: 0xffffffff,
    }
 }

@@ -433,25 +440,6 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key {
    }
 }

-#[inline(always)]
-pub fn slru_dir_kind(key: &Key) -> Option<Result<SlruKind, u32>> {
-    if key.field1 == 0x01
-        && key.field3 == 0
-        && key.field4 == 0
-        && key.field5 == 0
-        && key.field6 == 0
-    {
-        match key.field2 {
-            0 => Some(Ok(SlruKind::Clog)),
-            1 => Some(Ok(SlruKind::MultiXactMembers)),
-            2 => Some(Ok(SlruKind::MultiXactOffsets)),
-            x => Some(Err(x)),
-        }
-    } else {
-        None
-    }
-}
-
 #[inline(always)]
 pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
    Key {
@@ -480,17 +468,7 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
        field3: 1,
        field4: segno,
        field5: 0,
-        field6: 0xffff_ffff,
-    }
-}
-
-impl Key {
-    pub fn is_slru_segment_size_key(&self) -> bool {
-        self.field1 == 0x01
-            && self.field2 < 0x03
-            && self.field3 == 0x01
-            && self.field5 == 0
-            && self.field6 == u32::MAX
+        field6: 0xffffffff,
    }
 }

@@ -591,37 +569,6 @@ pub const AUX_FILES_KEY: Key = Key {
    field6: 2,
 };

-#[inline(always)]
-pub fn repl_origin_key(origin_id: RepOriginId) -> Key {
-    Key {
-        field1: REPL_ORIGIN_KEY_PREFIX,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: origin_id as u32,
-    }
-}
-
-/// Get the range of replorigin keys.
-pub fn repl_origin_key_range() -> Range<Key> {
-    Key {
-        field1: REPL_ORIGIN_KEY_PREFIX,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: REPL_ORIGIN_KEY_PREFIX,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0x10000,
-    }
-}
-
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

@@ -630,78 +577,73 @@ pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
 /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
 pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();

-impl Key {
-    // AUX_FILES currently stores only data for logical replication (slots etc), and
-    // we don't preserve these on a branch because safekeepers can't follow timeline
-    // switch (and generally it likely should be optional), so ignore these.
-    #[inline(always)]
-    pub fn is_inherited_key(self) -> bool {
-        !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
-    }
+// AUX_FILES currently stores only data for logical replication (slots etc), and
+// we don't preserve these on a branch because safekeepers can't follow timeline
+// switch (and generally it likely should be optional), so ignore these.
+#[inline(always)]
+pub fn is_inherited_key(key: Key) -> bool {
+    !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
+}

-    #[inline(always)]
-    pub fn is_rel_fsm_block_key(self) -> bool {
-        self.field1 == 0x00
-            && self.field4 != 0
-            && self.field5 == FSM_FORKNUM
-            && self.field6 != 0xffffffff
-    }
+#[inline(always)]
+pub fn is_rel_fsm_block_key(key: Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
+}

-    #[inline(always)]
-    pub fn is_rel_vm_block_key(self) -> bool {
-        self.field1 == 0x00
-            && self.field4 != 0
-            && self.field5 == VISIBILITYMAP_FORKNUM
-            && self.field6 != 0xffffffff
-    }
+#[inline(always)]
+pub fn is_rel_vm_block_key(key: Key) -> bool {
+    key.field1 == 0x00
+        && key.field4 != 0
+        && key.field5 == VISIBILITYMAP_FORKNUM
+        && key.field6 != 0xffffffff
+}

-    #[inline(always)]
-    pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
-        Ok(match self.field1 {
-            0x01 => {
-                let kind = match self.field2 {
-                    0x00 => SlruKind::Clog,
-                    0x01 => SlruKind::MultiXactMembers,
-                    0x02 => SlruKind::MultiXactOffsets,
-                    _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2),
-                };
-                let segno = self.field4;
-                let blknum = self.field6;
+#[inline(always)]
+pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
+    Ok(match key.field1 {
+        0x01 => {
+            let kind = match key.field2 {
+                0x00 => SlruKind::Clog,
+                0x01 => SlruKind::MultiXactMembers,
+                0x02 => SlruKind::MultiXactOffsets,
+                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
+            };
+            let segno = key.field4;
+            let blknum = key.field6;

-                (kind, segno, blknum)
-            }
-            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
-        })
-    }
+            (kind, segno, blknum)
+        }
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
+    })
+}

-    #[inline(always)]
-    pub fn is_slru_block_key(self) -> bool {
-        self.field1 == 0x01                // SLRU-related
-        && self.field3 == 0x00000001   // but not SlruDir
-        && self.field6 != 0xffffffff // and not SlruSegSize
-    }
+#[inline(always)]
+pub fn is_slru_block_key(key: Key) -> bool {
+    key.field1 == 0x01                // SLRU-related
+        && key.field3 == 0x00000001   // but not SlruDir
+        && key.field6 != 0xffffffff // and not SlruSegSize
+}

-    #[inline(always)]
-    pub fn is_rel_block_key(&self) -> bool {
-        self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
-    }
+#[inline(always)]
+pub fn is_rel_block_key(key: &Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
+}

-    /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
-    #[inline(always)]
-    pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
-        Ok(match self.field1 {
-            0x00 => (
-                RelTag {
-                    spcnode: self.field2,
-                    dbnode: self.field3,
-                    relnode: self.field4,
-                    forknum: self.field5,
-                },
-                self.field6,
-            ),
-            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
-        })
-    }
+/// Guaranteed to return `Ok()` if [`is_rel_block_key`] returns `true` for `key`.
+#[inline(always)]
+pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
+    Ok(match key.field1 {
+        0x00 => (
+            RelTag {
+                spcnode: key.field2,
+                dbnode: key.field3,
+                relnode: key.field4,
+                forknum: key.field5,
+            },
+            key.field6,
+        ),
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
+    })
 }

 impl std::str::FromStr for Key {
@@ -745,15 +687,10 @@ mod tests {
        let mut metadata_key = vec![AUX_KEY_PREFIX];
        metadata_key.extend_from_slice(&[0xFF; 15]);
        let encoded_key = Key::from_metadata_key(&metadata_key);
-        let output_key = encoded_key.to_i128().to_be_bytes();
+        let mut output_key = Vec::new();
+        encoded_key.extract_metadata_key_to_writer(&mut output_key);
        assert_eq!(metadata_key, output_key);
        assert!(encoded_key.is_metadata_key());
        assert!(is_metadata_key_slice(&metadata_key));
    }
-
-    #[test]
-    fn test_possible_largest_key() {
-        Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF);
-        // TODO: put this key into the system and see if anything breaks.
-    }
 }
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -3,7 +3,7 @@ use std::cmp::Ordering;
 use std::fmt;

 use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
-use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM};
+use postgres_ffi::relfile_utils::forknumber_to_name;
 use postgres_ffi::Oid;

 ///
@@ -68,57 +68,6 @@ impl fmt::Display for RelTag {
    }
 }

-#[derive(Debug, thiserror::Error)]
-pub enum ParseRelTagError {
-    #[error("invalid forknum")]
-    InvalidForknum(#[source] std::num::ParseIntError),
-    #[error("missing triplet member {}", .0)]
-    MissingTripletMember(usize),
-    #[error("invalid triplet member {}", .0)]
-    InvalidTripletMember(usize, #[source] std::num::ParseIntError),
-}
-
-impl std::str::FromStr for RelTag {
-    type Err = ParseRelTagError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        use ParseRelTagError::*;
-
-        // FIXME: in postgres logs this separator is dot
-        // Example:
-        //     could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0
-        // with a regex we could get this more painlessly
-        let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) {
-            Some((t, f)) => {
-                let forknum = forkname_to_number(Some(f));
-                let forknum = if let Ok(f) = forknum {
-                    f
-                } else {
-                    f.parse::<u8>().map_err(InvalidForknum)?
-                };
-
-                (t, Some(forknum))
-            }
-            None => (s, None),
-        };
-
-        let mut split = triplet
-            .splitn(3, '/')
-            .enumerate()
-            .map(|(i, s)| s.parse::<u32>().map_err(|e| InvalidTripletMember(i, e)));
-        let spcnode = split.next().ok_or(MissingTripletMember(0))??;
-        let dbnode = split.next().ok_or(MissingTripletMember(1))??;
-        let relnode = split.next().ok_or(MissingTripletMember(2))??;
-
-        Ok(RelTag {
-            spcnode,
-            forknum: forknum.unwrap_or(MAIN_FORKNUM),
-            dbnode,
-            relnode,
-        })
-    }
-}
-
 impl RelTag {
    pub fn to_segfile_name(&self, segno: u32) -> String {
        let mut name = if self.spcnode == GLOBALTABLESPACE_OID {
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,6 +1,9 @@
 use std::{ops::RangeInclusive, str::FromStr};

-use crate::{key::Key, models::ShardParameters};
+use crate::{
+    key::{is_rel_block_key, Key},
+    models::ShardParameters,
+};
 use hex::FromHex;
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
@@ -425,12 +428,6 @@ impl<'de> Deserialize<'de> for TenantShardId {
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);

-impl Default for ShardStripeSize {
-    fn default() -> Self {
-        DEFAULT_STRIPE_SIZE
-    }
-}
-
 /// Layout version: for future upgrades where we might change how the key->shard mapping works
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardLayout(u8);
@@ -669,7 +666,7 @@ fn key_is_shard0(key: &Key) -> bool {
    // because they must be included in basebackups.
    let is_initfork = key.field5 == INIT_FORKNUM;

-    !key.is_rel_block_key() || is_initfork
+    !is_rel_block_key(key) || is_initfork
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
@@ -716,25 +713,6 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
    ShardNumber((hash % count.0 as u32) as u8)
 }

-/// For debugging, while not exposing the internals.
-#[derive(Debug)]
-#[allow(unused)] // used by debug formatting by pagectl
-struct KeyShardingInfo {
-    shard0: bool,
-    shard_number: ShardNumber,
-}
-
-pub fn describe(
-    key: &Key,
-    shard_count: ShardCount,
-    stripe_size: ShardStripeSize,
-) -> impl std::fmt::Debug {
-    KeyShardingInfo {
-        shard0: key_is_shard0(key),
-        shard_number: key_to_shard_number(shard_count, stripe_size, key),
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use utils::Hex;
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -126,7 +126,6 @@ fn main() -> anyhow::Result<()> {
            .allowlist_type("PageHeaderData")
            .allowlist_type("DBState")
            .allowlist_type("RelMapFile")
-            .allowlist_type("RepOriginId")
            // Because structs are used for serialization, tell bindgen to emit
            // explicit padding fields.
            .explicit_padding(true)
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -110,7 +110,6 @@ pub mod pg_constants;
 pub mod relfile_utils;

 // Export some widely used datatypes that are unlikely to change across Postgres versions
-pub use v14::bindings::RepOriginId;
 pub use v14::bindings::{uint32, uint64, Oid};
 pub use v14::bindings::{BlockNumber, OffsetNumber};
 pub use v14::bindings::{MultiXactId, TransactionId};
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -102,7 +102,7 @@ pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
 pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
 pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
 pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
-pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
+// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
 // pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
 // pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;

@@ -167,7 +167,6 @@ pub const RM_RELMAP_ID: u8 = 7;
 pub const RM_STANDBY_ID: u8 = 8;
 pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
-pub const RM_REPLORIGIN_ID: u8 = 19;
 pub const RM_LOGICALMSG_ID: u8 = 21;

 // from neon_rmgr.h
@@ -224,10 +223,6 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

-/* From xlog.h */
-pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
-pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;
-
 /* From replication/slot.h */
 pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
   + 64 /* NameData */  + 4*4;
@@ -242,9 +237,6 @@ pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;
 pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
    (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)

-/* From origin.c */
-pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
-
 // List of subdirectories inside pgdata.
 // Copied from src/bin/initdb/initdb.c
 pub const PGDATA_SUBDIRS: [&str; 22] = [
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -7,7 +7,6 @@ license.workspace = true
 [dependencies]
 bytes.workspace = true
 byteorder.workspace = true
-itertools.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -7,9 +7,8 @@ pub mod framed;

 use byteorder::{BigEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
-use itertools::Itertools;
 use serde::{Deserialize, Serialize};
-use std::{borrow::Cow, fmt, io, str};
+use std::{borrow::Cow, collections::HashMap, fmt, io, str};

 // re-export for use in utils pageserver_feedback.rs
 pub use postgres_protocol::PG_EPOCH;
@@ -51,37 +50,15 @@ pub enum FeStartupPacket {
    },
 }

-#[derive(Debug, Clone, Default)]
-pub struct StartupMessageParamsBuilder {
-    params: BytesMut,
-}
-
-impl StartupMessageParamsBuilder {
-    /// Set parameter's value by its name.
-    /// name and value must not contain a \0 byte
-    pub fn insert(&mut self, name: &str, value: &str) {
-        self.params.put(name.as_bytes());
-        self.params.put(&b"\0"[..]);
-        self.params.put(value.as_bytes());
-        self.params.put(&b"\0"[..]);
-    }
-
-    pub fn freeze(self) -> StartupMessageParams {
-        StartupMessageParams {
-            params: self.params.freeze(),
-        }
-    }
-}
-
-#[derive(Debug, Clone, Default)]
+#[derive(Debug)]
 pub struct StartupMessageParams {
-    params: Bytes,
+    params: HashMap<String, String>,
 }

 impl StartupMessageParams {
    /// Get parameter's value by its name.
    pub fn get(&self, name: &str) -> Option<&str> {
-        self.iter().find_map(|(k, v)| (k == name).then_some(v))
+        self.params.get(name).map(|s| s.as_str())
    }

    /// Split command-line options according to PostgreSQL's logic,
@@ -135,19 +112,15 @@ impl StartupMessageParams {

    /// Iterate through key-value pairs in an arbitrary order.
    pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
-        let params =
-            std::str::from_utf8(&self.params).expect("should be validated as utf8 already");
-        params.split_terminator('\0').tuples()
+        self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
    }

    // This function is mostly useful in tests.
    #[doc(hidden)]
    pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
-        let mut b = StartupMessageParamsBuilder::default();
-        for (k, v) in pairs {
-            b.insert(k, v)
+        Self {
+            params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
        }
-        b.freeze()
    }
 }

@@ -372,21 +345,35 @@ impl FeStartupPacket {
            (major_version, minor_version) => {
                // StartupMessage

-                let s = str::from_utf8(&msg).map_err(|_e| {
-                    ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
-                })?;
-                let s = s.strip_suffix('\0').ok_or_else(|| {
-                    ProtocolError::Protocol(
-                        "StartupMessage params: missing null terminator".to_string(),
-                    )
-                })?;
+                // Parse pairs of null-terminated strings (key, value).
+                // See `postgres: ProcessStartupPacket, build_startup_packet`.
+                let mut tokens = str::from_utf8(&msg)
+                    .map_err(|_e| {
+                        ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
+                    })?
+                    .strip_suffix('\0') // drop packet's own null
+                    .ok_or_else(|| {
+                        ProtocolError::Protocol(
+                            "StartupMessage params: missing null terminator".to_string(),
+                        )
+                    })?
+                    .split_terminator('\0');
+
+                let mut params = HashMap::new();
+                while let Some(name) = tokens.next() {
+                    let value = tokens.next().ok_or_else(|| {
+                        ProtocolError::Protocol(
+                            "StartupMessage params: key without value".to_string(),
+                        )
+                    })?;
+
+                    params.insert(name.to_owned(), value.to_owned());
+                }

                FeStartupPacket::StartupMessage {
                    major_version,
                    minor_version,
-                    params: StartupMessageParams {
-                        params: msg.slice_ref(s.as_bytes()),
-                    },
+                    params: StartupMessageParams { params },
                }
            }
        };
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -3,7 +3,6 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::env;
-use std::fmt::Display;
 use std::io;
 use std::num::NonZeroU32;
 use std::pin::Pin;
@@ -27,15 +26,14 @@ use futures::stream::Stream;
 use futures_util::StreamExt;
 use futures_util::TryStreamExt;
 use http_types::{StatusCode, Url};
-use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
-use utils::backoff;

-use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
+use crate::RemoteStorageActivity;
 use crate::{
-    error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
-    ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
+    error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
+    DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+    TimeTravelError, TimeoutOrCancel,
 };

 pub struct AzureBlobStorage {
@@ -140,8 +138,6 @@ impl AzureBlobStorage {
        let mut last_modified = None;
        let mut metadata = HashMap::new();

-        let started_at = start_measuring_requests(kind);
-
        let download = async {
            let response = builder
                // convert to concrete Pageable
@@ -205,22 +201,13 @@ impl AzureBlobStorage {
            })
        };

-        let download = tokio::select! {
+        tokio::select! {
            bufs = download => bufs,
            cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
-                TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout),
-                TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
+                TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
+                TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
            },
-        };
-        let started_at = ScopeGuard::into_inner(started_at);
-        let outcome = match &download {
-            Ok(_) => AttemptOutcome::Ok,
-            Err(_) => AttemptOutcome::Err,
-        };
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, outcome, started_at);
-        download
+        }
    }

    async fn permit(
@@ -354,10 +341,7 @@ impl RemoteStorage for AzureBlobStorage {
        metadata: Option<StorageMetadata>,
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-        let kind = RequestKind::Put;
-        let _permit = self.permit(kind, cancel).await?;
-
-        let started_at = start_measuring_requests(kind);
+        let _permit = self.permit(RequestKind::Put, cancel).await?;

        let op = async {
            let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -381,25 +365,14 @@ impl RemoteStorage for AzureBlobStorage {
            match fut.await {
                Ok(Ok(_response)) => Ok(()),
                Ok(Err(azure)) => Err(azure.into()),
-                Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
+                Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
            }
        };

-        let res = tokio::select! {
+        tokio::select! {
            res = op => res,
-            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
-        };
-
-        let outcome = match res {
-            Ok(_) => AttemptOutcome::Ok,
-            Err(_) => AttemptOutcome::Err,
-        };
-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, outcome, started_at);
-
-        res
+            _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
+        }
    }

    async fn download(
@@ -445,79 +418,40 @@ impl RemoteStorage for AzureBlobStorage {
        paths: &'a [RemotePath],
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-        let kind = RequestKind::Delete;
-        let _permit = self.permit(kind, cancel).await?;
-        let started_at = start_measuring_requests(kind);
+        let _permit = self.permit(RequestKind::Delete, cancel).await?;

        let op = async {
-            // TODO batch requests are not supported by the SDK
+            // TODO batch requests are also not supported by the SDK
            // https://github.com/Azure/azure-sdk-for-rust/issues/1068
+            // https://github.com/Azure/azure-sdk-for-rust/issues/1249
            for path in paths {
-                #[derive(Debug)]
-                enum AzureOrTimeout {
-                    AzureError(azure_core::Error),
-                    Timeout,
-                    Cancel,
-                }
-                impl Display for AzureOrTimeout {
-                    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                        write!(f, "{self:?}")
-                    }
-                }
-                let warn_threshold = 3;
-                let max_retries = 5;
-                backoff::retry(
-                    || async {
-                        let blob_client = self.client.blob_client(self.relative_path_to_name(path));
+                let blob_client = self.client.blob_client(self.relative_path_to_name(path));

-                        let request = blob_client.delete().into_future();
+                let request = blob_client.delete().into_future();

-                        let res = tokio::time::timeout(self.timeout, request).await;
+                let res = tokio::time::timeout(self.timeout, request).await;

-                        match res {
-                            Ok(Ok(_v)) => Ok(()),
-                            Ok(Err(azure_err)) => {
-                                if let Some(http_err) = azure_err.as_http_error() {
-                                    if http_err.status() == StatusCode::NotFound {
-                                        return Ok(());
-                                    }
-                                }
-                                Err(AzureOrTimeout::AzureError(azure_err))
+                match res {
+                    Ok(Ok(_response)) => continue,
+                    Ok(Err(e)) => {
+                        if let Some(http_err) = e.as_http_error() {
+                            if http_err.status() == StatusCode::NotFound {
+                                continue;
                            }
-                            Err(_elapsed) => Err(AzureOrTimeout::Timeout),
                        }
-                    },
-                    |err| match err {
-                        AzureOrTimeout::AzureError(_) | AzureOrTimeout::Timeout => false,
-                        AzureOrTimeout::Cancel => true,
-                    },
-                    warn_threshold,
-                    max_retries,
-                    "deleting remote object",
-                    cancel,
-                )
-                .await
-                .ok_or_else(|| AzureOrTimeout::Cancel)
-                .and_then(|x| x)
-                .map_err(|e| match e {
-                    AzureOrTimeout::AzureError(err) => anyhow::Error::from(err),
-                    AzureOrTimeout::Timeout => TimeoutOrCancel::Timeout.into(),
-                    AzureOrTimeout::Cancel => TimeoutOrCancel::Cancel.into(),
-                })?;
+                        return Err(e.into());
+                    }
+                    Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()),
+                }
            }
+
            Ok(())
        };

-        let res = tokio::select! {
+        tokio::select! {
            res = op => res,
-            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
-        };
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-        res
+            _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
+        }
    }

    async fn copy(
@@ -526,9 +460,7 @@ impl RemoteStorage for AzureBlobStorage {
        to: &RemotePath,
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-        let kind = RequestKind::Copy;
-        let _permit = self.permit(kind, cancel).await?;
-        let started_at = start_measuring_requests(kind);
+        let _permit = self.permit(RequestKind::Copy, cancel).await?;

        let timeout = tokio::time::sleep(self.timeout);

@@ -572,21 +504,15 @@ impl RemoteStorage for AzureBlobStorage {
            }
        };

-        let res = tokio::select! {
+        tokio::select! {
            res = op => res,
-            _ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
+            _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
            _ = timeout => {
                let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
                let e = e.context(format!("Timeout, last status: {copy_status:?}"));
                Err(e)
            },
-        };
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-        res
+        }
    }

    async fn time_travel_recover(
@@ -600,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage {
        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
        Err(TimeTravelError::Unimplemented)
    }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.concurrency_limiter.activity()
+    }
 }

 pin_project_lite::pin_project! {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -12,7 +12,6 @@
 mod azure_blob;
 mod error;
 mod local_fs;
-mod metrics;
 mod s3_bucket;
 mod simulate_failures;
 mod support;
@@ -122,8 +121,8 @@ impl RemotePath {
        self.0.file_name()
    }

-    pub fn join(&self, path: impl AsRef<Utf8Path>) -> Self {
-        Self(self.0.join(path))
+    pub fn join(&self, segment: impl AsRef<Utf8Path>) -> Self {
+        Self(self.0.join(segment))
    }

    pub fn get_path(&self) -> &Utf8PathBuf {
@@ -264,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static {
        done_if_after: SystemTime,
        cancel: &CancellationToken,
    ) -> Result<(), TimeTravelError>;
+
+    /// Query how busy we currently are: may be used by callers which wish to politely
+    /// back off if there are already a lot of operations underway.
+    fn activity(&self) -> RemoteStorageActivity;
+}
+
+pub struct RemoteStorageActivity {
+    pub read_available: usize,
+    pub read_total: usize,
+    pub write_available: usize,
+    pub write_total: usize,
 }

 /// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -445,6 +455,15 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
            }
        }
    }
+
+    pub fn activity(&self) -> RemoteStorageActivity {
+        match self {
+            Self::LocalFs(s) => s.activity(),
+            Self::AwsS3(s) => s.activity(),
+            Self::AzureBlob(s) => s.activity(),
+            Self::Unreliable(s) => s.activity(),
+        }
+    }
 }

 impl GenericRemoteStorage {
@@ -775,6 +794,9 @@ struct ConcurrencyLimiter {
    // The helps to ensure we don't exceed the thresholds.
    write: Arc<Semaphore>,
    read: Arc<Semaphore>,
+
+    write_total: usize,
+    read_total: usize,
 }

 impl ConcurrencyLimiter {
@@ -803,10 +825,21 @@ impl ConcurrencyLimiter {
        Arc::clone(self.for_kind(kind)).acquire_owned().await
    }

+    fn activity(&self) -> RemoteStorageActivity {
+        RemoteStorageActivity {
+            read_available: self.read.available_permits(),
+            read_total: self.read_total,
+            write_available: self.write.available_permits(),
+            write_total: self.write_total,
+        }
+    }
+
    fn new(limit: usize) -> ConcurrencyLimiter {
        Self {
            read: Arc::new(Semaphore::new(limit)),
            write: Arc::new(Semaphore::new(limit)),
+            read_total: limit,
+            write_total: limit,
        }
    }
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use utils::crashsafe::path_with_suffix_extension;

 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
+    TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use super::{RemoteStorage, StorageMetadata};
@@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs {
    ) -> Result<(), TimeTravelError> {
        Err(TimeTravelError::Unimplemented)
    }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
+        RemoteStorageActivity {
+            read_available: 16,
+            read_total: 16,
+            write_available: 16,
+            write_total: 16,
+        }
+    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -46,16 +46,15 @@ use utils::backoff;

 use super::StorageMetadata;
 use crate::{
-    error::Cancelled,
-    metrics::{start_counting_cancelled_wait, start_measuring_requests},
-    support::PermitCarrying,
-    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
-    S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
+    Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
+    TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

-use crate::metrics::AttemptOutcome;
-pub(super) use crate::metrics::RequestKind;
+pub(super) mod metrics;
+
+use self::metrics::AttemptOutcome;
+pub(super) use self::metrics::RequestKind;

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -228,7 +227,7 @@ impl S3Bucket {
        };

        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
+        metrics::BUCKET_METRICS
            .wait_seconds
            .observe_elapsed(kind, started_at);

@@ -249,7 +248,7 @@ impl S3Bucket {
        };

        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
+        metrics::BUCKET_METRICS
            .wait_seconds
            .observe_elapsed(kind, started_at);
        Ok(permit)
@@ -288,7 +287,7 @@ impl S3Bucket {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                    kind,
                    AttemptOutcome::Ok,
                    started_at,
@@ -296,7 +295,7 @@ impl S3Bucket {
                return Err(DownloadError::NotFound);
            }
            Err(e) => {
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                    kind,
                    AttemptOutcome::Err,
                    started_at,
@@ -372,12 +371,12 @@ impl S3Bucket {
            };

            let started_at = ScopeGuard::into_inner(started_at);
-            crate::metrics::BUCKET_METRICS
+            metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, &resp, started_at);

            let resp = resp.context("request deletion")?;
-            crate::metrics::BUCKET_METRICS
+            metrics::BUCKET_METRICS
                .deleted_objects_total
                .inc_by(chunk.len() as u64);

@@ -436,14 +435,14 @@ pin_project_lite::pin_project! {
    /// Times and tracks the outcome of the request.
    struct TimedDownload<S> {
        started_at: std::time::Instant,
-        outcome: AttemptOutcome,
+        outcome: metrics::AttemptOutcome,
        #[pin]
        inner: S
    }

    impl<S> PinnedDrop for TimedDownload<S> {
        fn drop(mut this: Pin<&mut Self>) {
-            crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
+            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
        }
    }
 }
@@ -452,7 +451,7 @@ impl<S> TimedDownload<S> {
    fn new(started_at: std::time::Instant, inner: S) -> Self {
        TimedDownload {
            started_at,
-            outcome: AttemptOutcome::Cancelled,
+            outcome: metrics::AttemptOutcome::Cancelled,
            inner,
        }
    }
@@ -469,8 +468,8 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
        let res = ready!(this.inner.poll_next(cx));
        match &res {
            Some(Ok(_)) => {}
-            Some(Err(_)) => *this.outcome = AttemptOutcome::Err,
-            None => *this.outcome = AttemptOutcome::Ok,
+            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
+            None => *this.outcome = metrics::AttemptOutcome::Ok,
        }

        Poll::Ready(res)
@@ -544,7 +543,7 @@ impl RemoteStorage for S3Bucket {

            let started_at = ScopeGuard::into_inner(started_at);

-            crate::metrics::BUCKET_METRICS
+            metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, &response, started_at);

@@ -626,7 +625,7 @@ impl RemoteStorage for S3Bucket {
        if let Ok(inner) = &res {
            // do not incl. timeouts as errors in metrics but cancellations
            let started_at = ScopeGuard::into_inner(started_at);
-            crate::metrics::BUCKET_METRICS
+            metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, inner, started_at);
        }
@@ -674,7 +673,7 @@ impl RemoteStorage for S3Bucket {
        };

        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
+        metrics::BUCKET_METRICS
            .req_seconds
            .observe_elapsed(kind, &res, started_at);

@@ -976,6 +975,32 @@ impl RemoteStorage for S3Bucket {
        }
        Ok(())
    }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.concurrency_limiter.activity()
+    }
+}
+
+/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
+fn start_counting_cancelled_wait(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
+        metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
+    })
+}
+
+/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
+fn start_measuring_requests(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
+        metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+            kind,
+            AttemptOutcome::Cancelled,
+            started_at,
+        )
+    })
 }

 // Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -15,7 +15,6 @@ pub(crate) enum RequestKind {
    TimeTravel = 5,
 }

-use scopeguard::ScopeGuard;
 use RequestKind::*;

 impl RequestKind {
@@ -34,10 +33,10 @@ impl RequestKind {
    }
 }

-pub(crate) struct RequestTyped<C>([C; 6]);
+pub(super) struct RequestTyped<C>([C; 6]);

 impl<C> RequestTyped<C> {
-    pub(crate) fn get(&self, kind: RequestKind) -> &C {
+    pub(super) fn get(&self, kind: RequestKind) -> &C {
        &self.0[kind.as_index()]
    }

@@ -59,19 +58,19 @@ impl<C> RequestTyped<C> {
 }

 impl RequestTyped<Histogram> {
-    pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
+    pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
        self.get(kind).observe(started_at.elapsed().as_secs_f64())
    }
 }

-pub(crate) struct PassFailCancelledRequestTyped<C> {
+pub(super) struct PassFailCancelledRequestTyped<C> {
    success: RequestTyped<C>,
    fail: RequestTyped<C>,
    cancelled: RequestTyped<C>,
 }

 #[derive(Debug, Clone, Copy)]
-pub(crate) enum AttemptOutcome {
+pub(super) enum AttemptOutcome {
    Ok,
    Err,
    Cancelled,
@@ -87,7 +86,7 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
 }

 impl AttemptOutcome {
-    pub(crate) fn as_str(&self) -> &'static str {
+    pub(super) fn as_str(&self) -> &'static str {
        match self {
            AttemptOutcome::Ok => "ok",
            AttemptOutcome::Err => "err",
@@ -97,7 +96,7 @@ impl AttemptOutcome {
 }

 impl<C> PassFailCancelledRequestTyped<C> {
-    pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
+    pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
        let target = match outcome {
            AttemptOutcome::Ok => &self.success,
            AttemptOutcome::Err => &self.fail,
@@ -120,7 +119,7 @@ impl<C> PassFailCancelledRequestTyped<C> {
 }

 impl PassFailCancelledRequestTyped<Histogram> {
-    pub(crate) fn observe_elapsed(
+    pub(super) fn observe_elapsed(
        &self,
        kind: RequestKind,
        outcome: impl Into<AttemptOutcome>,
@@ -131,44 +130,19 @@ impl PassFailCancelledRequestTyped<Histogram> {
    }
 }

-/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
-pub(crate) fn start_counting_cancelled_wait(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
-        crate::metrics::BUCKET_METRICS
-            .cancelled_waits
-            .get(kind)
-            .inc()
-    })
-}
-
-/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
-pub(crate) fn start_measuring_requests(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
-        crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-            kind,
-            AttemptOutcome::Cancelled,
-            started_at,
-        )
-    })
-}
-
-pub(crate) struct BucketMetrics {
+pub(super) struct BucketMetrics {
    /// Full request duration until successful completion, error or cancellation.
-    pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
    /// Total amount of seconds waited on queue.
-    pub(crate) wait_seconds: RequestTyped<Histogram>,
+    pub(super) wait_seconds: RequestTyped<Histogram>,

    /// Track how many semaphore awaits were cancelled per request type.
    ///
    /// This is in case cancellations are happening more than expected.
-    pub(crate) cancelled_waits: RequestTyped<IntCounter>,
+    pub(super) cancelled_waits: RequestTyped<IntCounter>,

    /// Total amount of deleted objects in batches or single requests.
-    pub(crate) deleted_objects_total: IntCounter,
+    pub(super) deleted_objects_total: IntCounter,
 }

 impl Default for BucketMetrics {
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;

 use crate::{
    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
-    StorageMetadata, TimeTravelError,
+    RemoteStorageActivity, StorageMetadata, TimeTravelError,
 };

 pub struct UnreliableWrapper {
@@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper {
            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
            .await
    }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.inner.activity()
+    }
 }
--- a/libs/remote_storage/src/support.rs
+++ b/libs/remote_storage/src/support.rs
@@ -78,10 +78,6 @@ where
                let e = Err(std::io::Error::from(e));
                return Poll::Ready(Some(e));
            }
-        } else {
-            // this would be perfectly valid behaviour for doing a graceful completion on the
-            // download for example, but not one we expect to do right now.
-            tracing::warn!("continuing polling after having cancelled or timeouted");
        }

        this.inner.poll_next(cx)
@@ -93,22 +89,13 @@ where
 }

 /// Fires only on the first cancel or timeout, not on both.
-pub(crate) fn cancel_or_timeout(
+pub(crate) async fn cancel_or_timeout(
    timeout: Duration,
    cancel: CancellationToken,
-) -> impl std::future::Future<Output = TimeoutOrCancel> + 'static {
-    // futures are lazy, they don't do anything before being polled.
-    //
-    // "precalculate" the wanted deadline before returning the future, so that we can use pause
-    // failpoint to trigger a timeout in test.
-    let deadline = tokio::time::Instant::now() + timeout;
-    async move {
-        tokio::select! {
-            _ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout,
-            _ = cancel.cancelled() => {
-                TimeoutOrCancel::Cancel
-            },
-        }
+) -> TimeoutOrCancel {
+    tokio::select! {
+        _ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
+        _ = cancel.cancelled() => TimeoutOrCancel::Cancel,
    }
 }

@@ -185,31 +172,4 @@ mod tests {
            _ = tokio::time::sleep(Duration::from_secs(121)) => {},
        }
    }
-
-    #[tokio::test]
-    async fn notified_but_pollable_after() {
-        let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static(
-            b"hello world",
-        ))));
-        let timeout = Duration::from_secs(120);
-        let cancel = CancellationToken::new();
-
-        cancel.cancel();
-        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
-        let mut stream = std::pin::pin!(stream);
-
-        let next = stream.next().await;
-        let ioe = next.unwrap().unwrap_err();
-        assert!(
-            matches!(
-                ioe.get_ref().unwrap().downcast_ref::<DownloadError>(),
-                Some(&DownloadError::Cancelled)
-            ),
-            "{ioe:?}"
-        );
-
-        let next = stream.next().await;
-        let bytes = next.unwrap().unwrap();
-        assert_eq!(&b"hello world"[..], bytes);
-    }
 }
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -7,7 +7,7 @@ license.workspace = true
 [dependencies]
 hyper.workspace = true
 opentelemetry = { workspace = true, features=["rt-tokio"] }
-opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
 reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -9,33 +9,6 @@ use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-/// Declare a failpoint that can use the `pause` failpoint action.
-/// We don't want to block the executor thread, hence, spawn_blocking + await.
-#[macro_export]
-macro_rules! pausable_failpoint {
-    ($name:literal) => {
-        if cfg!(feature = "testing") {
-            tokio::task::spawn_blocking({
-                let current = tracing::Span::current();
-                move || {
-                    let _entered = current.entered();
-                    tracing::info!("at failpoint {}", $name);
-                    fail::fail_point!($name);
-                }
-            })
-            .await
-            .expect("spawn_blocking");
-        }
-    };
-    ($name:literal, $cond:expr) => {
-        if cfg!(feature = "testing") {
-            if $cond {
-                pausable_failpoint!($name)
-            }
-        }
-    };
-}
-
 /// use with fail::cfg("$name", "return(2000)")
 ///
 /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -3,9 +3,6 @@ use std::{fs, io, path::Path};

 use anyhow::Context;

-mod rename_noreplace;
-pub use rename_noreplace::rename_noreplace;
-
 pub trait PathExt {
    /// Returns an error if `self` is not a directory.
    fn is_empty_dir(&self) -> io::Result<bool>;
--- a/libs/utils/src/fs_ext/rename_noreplace.rs
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -1,109 +0,0 @@
-use nix::NixPath;
-
-/// Rename a file without replacing an existing file.
-///
-/// This is a wrapper around platform-specific APIs.
-pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
-    src: &P1,
-    dst: &P2,
-) -> nix::Result<()> {
-    {
-        #[cfg(target_os = "linux")]
-        {
-            nix::fcntl::renameat2(
-                None,
-                src,
-                None,
-                dst,
-                nix::fcntl::RenameFlags::RENAME_NOREPLACE,
-            )
-        }
-        #[cfg(target_os = "macos")]
-        {
-            let res = src.with_nix_path(|src| {
-                dst.with_nix_path(|dst|
-                    // SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np.
-                    unsafe {
-                        nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL)
-                })
-            })??;
-            nix::errno::Errno::result(res).map(drop)
-        }
-        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
-        {
-            std::compile_error!("OS does not support no-replace renames");
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use std::{fs, path::PathBuf};
-
-    use super::*;
-
-    fn testdir() -> camino_tempfile::Utf8TempDir {
-        match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") {
-            Some(path) => {
-                let path: camino::Utf8PathBuf = path;
-                camino_tempfile::tempdir_in(path).unwrap()
-            }
-            None => camino_tempfile::tempdir().unwrap(),
-        }
-    }
-
-    #[test]
-    fn test_absolute_paths() {
-        let testdir = testdir();
-        println!("testdir: {}", testdir.path());
-
-        let src = testdir.path().join("src");
-        let dst = testdir.path().join("dst");
-
-        fs::write(&src, b"").unwrap();
-        fs::write(&dst, b"").unwrap();
-
-        let src = src.canonicalize().unwrap();
-        assert!(src.is_absolute());
-        let dst = dst.canonicalize().unwrap();
-        assert!(dst.is_absolute());
-
-        let result = rename_noreplace(&src, &dst);
-        assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
-    }
-
-    #[test]
-    fn test_relative_paths() {
-        let testdir = testdir();
-        println!("testdir: {}", testdir.path());
-
-        // this is fine because we run in nextest => process per test
-        std::env::set_current_dir(testdir.path()).unwrap();
-
-        let src = PathBuf::from("src");
-        let dst = PathBuf::from("dst");
-
-        fs::write(&src, b"").unwrap();
-        fs::write(&dst, b"").unwrap();
-
-        let result = rename_noreplace(&src, &dst);
-        assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
-    }
-
-    #[test]
-    fn test_works_when_not_exists() {
-        let testdir = testdir();
-        println!("testdir: {}", testdir.path());
-
-        let src = testdir.path().join("src");
-        let dst = testdir.path().join("dst");
-
-        fs::write(&src, b"content").unwrap();
-
-        rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap();
-        assert_eq!(
-            "content",
-            String::from_utf8(std::fs::read(&dst).unwrap()).unwrap()
-        );
-    }
-}
--- a/libs/utils/src/hex.rs
+++ b/libs/utils/src/hex.rs
@@ -19,13 +19,13 @@
 /// // right: [0x68; 1]
 /// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
 /// ```
-pub struct Hex<S>(pub S);
+#[derive(PartialEq)]
+pub struct Hex<'a>(pub &'a [u8]);

-impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
+impl std::fmt::Debug for Hex<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "[")?;
-        let chunks = self.0.as_ref().chunks(16);
-        for (i, c) in chunks.enumerate() {
+        for (i, c) in self.0.chunks(16).enumerate() {
            if i > 0 && !c.is_empty() {
                writeln!(f, ", ")?;
            }
@@ -36,15 +36,6 @@ impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
                write!(f, "0x{b:02x}")?;
            }
        }
-        write!(f, "; {}]", self.0.as_ref().len())
-    }
-}
-
-impl<R: AsRef<[u8]>, L: AsRef<[u8]>> PartialEq<Hex<R>> for Hex<L> {
-    fn eq(&self, other: &Hex<R>) -> bool {
-        let left = self.0.as_ref();
-        let right = other.0.as_ref();
-
-        left == right
+        write!(f, "; {}]", self.0.len())
    }
 }
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -34,9 +34,6 @@ pub enum ApiError {
    #[error("Timeout")]
    Timeout(Cow<'static, str>),

-    #[error("Request cancelled")]
-    Cancelled,
-
    #[error(transparent)]
    InternalServerError(anyhow::Error),
 }
@@ -77,10 +74,6 @@ impl ApiError {
                err.to_string(),
                StatusCode::REQUEST_TIMEOUT,
            ),
-            ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
-                self.to_string(),
-                StatusCode::INTERNAL_SERVER_ERROR,
-            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
@@ -140,7 +133,6 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
        ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
        ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
-        ApiError::Cancelled => info!("Request cancelled while processing HTTP request"),
        _ => info!("Error processing HTTP request: {api_error:#}"),
    }

--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -135,8 +135,7 @@ impl Gate {
        let started_at = std::time::Instant::now();
        let mut do_close = std::pin::pin!(self.do_close());

-        // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
-        let nag_after = Duration::from_millis(100);
+        let nag_after = Duration::from_secs(1);

        let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
            return;
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -380,8 +380,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
    }
    fn file_size(&self) -> u64 {
        match self {
-            MockLayer::Delta(this) => this.file_size,
-            MockLayer::Image(this) => this.file_size,
+            MockLayer::Delta(this) => this.file_size(),
+            MockLayer::Image(this) => this.file_size(),
        }
    }
    fn short_id(&self) -> String {
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -17,7 +17,6 @@ pageserver = { path = ".." }
 pageserver_api.workspace = true
 remote_storage = { path = "../../libs/remote_storage" }
 postgres_ffi.workspace = true
-thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
 toml_edit.workspace = true
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -1,6 +1,11 @@
+use std::collections::HashMap;
+
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::IndexPart;
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::storage_layer::LayerName;
+use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
+use utils::lsn::Lsn;

 #[derive(clap::Subcommand)]
 pub(crate) enum IndexPartCmd {
@@ -12,7 +17,20 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
        IndexPartCmd::Dump { path } => {
            let bytes = tokio::fs::read(path).await.context("read file")?;
            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
-            let output = serde_json::to_string_pretty(&des).context("serialize output")?;
+            #[derive(serde::Serialize)]
+            struct Output<'a> {
+                layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
+                disk_consistent_lsn: Lsn,
+                timeline_metadata: &'a TimelineMetadata,
+            }
+
+            let output = Output {
+                layer_metadata: &des.layer_metadata,
+                disk_consistent_lsn: des.get_disk_consistent_lsn(),
+                timeline_metadata: &des.metadata,
+            };
+
+            let output = serde_json::to_string_pretty(&output).context("serialize output")?;
            println!("{output}");
            Ok(())
        }
--- a/pageserver/ctl/src/key.rs
+++ b/pageserver/ctl/src/key.rs
@@ -1,475 +0,0 @@
-use anyhow::Context;
-use clap::Parser;
-use pageserver_api::{
-    key::Key,
-    reltag::{BlockNumber, RelTag, SlruKind},
-    shard::{ShardCount, ShardStripeSize},
-};
-use std::str::FromStr;
-
-#[derive(Parser)]
-pub(super) struct DescribeKeyCommand {
-    /// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum
-    input: Vec<String>,
-
-    /// The number of shards to calculate what Keys placement would be.
-    #[arg(long)]
-    shard_count: Option<CustomShardCount>,
-
-    /// The sharding stripe size.
-    ///
-    /// The default is hardcoded. It makes no sense to provide this without providing
-    /// `--shard-count`.
-    #[arg(long, requires = "shard_count")]
-    stripe_size: Option<u32>,
-}
-
-/// Sharded shard count without unsharded count, which the actual ShardCount supports.
-#[derive(Clone, Copy)]
-pub(super) struct CustomShardCount(std::num::NonZeroU8);
-
-#[derive(Debug, thiserror::Error)]
-pub(super) enum InvalidShardCount {
-    #[error(transparent)]
-    ParsingFailed(#[from] std::num::ParseIntError),
-    #[error("too few shards")]
-    TooFewShards,
-}
-
-impl FromStr for CustomShardCount {
-    type Err = InvalidShardCount;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let inner: std::num::NonZeroU8 = s.parse()?;
-        if inner.get() < 2 {
-            Err(InvalidShardCount::TooFewShards)
-        } else {
-            Ok(CustomShardCount(inner))
-        }
-    }
-}
-
-impl From<CustomShardCount> for ShardCount {
-    fn from(value: CustomShardCount) -> Self {
-        ShardCount::new(value.0.get())
-    }
-}
-
-impl DescribeKeyCommand {
-    pub(super) fn execute(self) {
-        let DescribeKeyCommand {
-            input,
-            shard_count,
-            stripe_size,
-        } = self;
-
-        let material = KeyMaterial::try_from(input.as_slice()).unwrap();
-        let kind = material.kind();
-        let key = Key::from(material);
-
-        println!("parsed from {kind}: {key}:");
-        println!();
-        println!("{key:?}");
-
-        macro_rules! kind_query {
-            ([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}};
-            ($name:ident) => {{
-                let s: &'static str = stringify!($name);
-                let s = s.strip_prefix("is_").unwrap_or(s);
-                let s = s.strip_suffix("_key").unwrap_or(s);
-
-                #[allow(clippy::needless_borrow)]
-                (s, key.$name())
-            }};
-        }
-
-        // the current characterization is a mess of these boolean queries and separate
-        // "recognization". I think it accurately represents how strictly we model the Key
-        // right now, but could of course be made less confusing.
-
-        let queries = kind_query!([
-            is_rel_block_key,
-            is_rel_vm_block_key,
-            is_rel_fsm_block_key,
-            is_slru_block_key,
-            is_inherited_key,
-            is_rel_size_key,
-            is_slru_segment_size_key,
-        ]);
-
-        let recognized_kind = "recognized kind";
-        let metadata_key = "metadata key";
-        let shard_placement = "shard placement";
-
-        let longest = queries
-            .iter()
-            .map(|t| t.0)
-            .chain([recognized_kind, metadata_key, shard_placement])
-            .map(|s| s.len())
-            .max()
-            .unwrap();
-
-        let colon = 1;
-        let padding = 1;
-
-        for (name, is) in queries {
-            let width = longest - name.len() + colon + padding;
-            println!("{}{:width$}{}", name, ":", is);
-        }
-
-        let width = longest - recognized_kind.len() + colon + padding;
-        println!(
-            "{}{:width$}{:?}",
-            recognized_kind,
-            ":",
-            RecognizedKeyKind::new(key),
-        );
-
-        if let Some(shard_count) = shard_count {
-            // seeing the sharding placement might be confusing, so leave it out unless shard
-            // count was given.
-
-            let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default();
-            println!(
-                "# placement with shard_count: {} and stripe_size: {}:",
-                shard_count.0, stripe_size.0
-            );
-            let width = longest - shard_placement.len() + colon + padding;
-            println!(
-                "{}{:width$}{:?}",
-                shard_placement,
-                ":",
-                pageserver_api::shard::describe(&key, shard_count.into(), stripe_size)
-            );
-        }
-    }
-}
-
-/// Hand-wavy "inputs we accept" for a key.
-#[derive(Debug)]
-pub(super) enum KeyMaterial {
-    Hex(Key),
-    String(SpanAttributesFromLogs),
-    Split(RelTag, BlockNumber),
-}
-
-impl KeyMaterial {
-    fn kind(&self) -> &'static str {
-        match self {
-            KeyMaterial::Hex(_) => "hex",
-            KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split",
-        }
-    }
-}
-
-impl From<KeyMaterial> for Key {
-    fn from(value: KeyMaterial) -> Self {
-        match value {
-            KeyMaterial::Hex(key) => key,
-            KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum))
-            | KeyMaterial::Split(rt, blocknum) => {
-                pageserver_api::key::rel_block_to_key(rt, blocknum)
-            }
-        }
-    }
-}
-
-impl<S: AsRef<str>> TryFrom<&[S]> for KeyMaterial {
-    type Error = anyhow::Error;
-
-    fn try_from(value: &[S]) -> Result<Self, Self::Error> {
-        match value {
-            [] => anyhow::bail!(
-                "need 1..N positional arguments describing the key, try hex or a log line"
-            ),
-            [one] => {
-                let one = one.as_ref();
-
-                let key = Key::from_hex(one).map(KeyMaterial::Hex);
-
-                let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String);
-
-                match (key, attrs) {
-                    (Ok(key), _) => Ok(key),
-                    (_, Ok(s)) => Ok(s),
-                    (Err(e1), Err(e2)) => anyhow::bail!(
-                        "failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}"
-                    ),
-                }
-            }
-            more => {
-                // assume going left to right one of these is a reltag and then we find a blocknum
-                // this works, because we don't have plain numbers at least right after reltag in
-                // logs. for some definition of "works".
-
-                let Some((reltag_at, reltag)) = more
-                    .iter()
-                    .map(AsRef::as_ref)
-                    .enumerate()
-                    .find_map(|(i, s)| {
-                        s.split_once("rel=")
-                            .map(|(_garbage, actual)| actual)
-                            .unwrap_or(s)
-                            .parse::<RelTag>()
-                            .ok()
-                            .map(|rt| (i, rt))
-                    })
-                else {
-                    anyhow::bail!("found no RelTag in arguments");
-                };
-
-                let Some(blocknum) = more
-                    .iter()
-                    .map(AsRef::as_ref)
-                    .skip(reltag_at)
-                    .find_map(|s| {
-                        s.split_once("blkno=")
-                            .map(|(_garbage, actual)| actual)
-                            .unwrap_or(s)
-                            .parse::<BlockNumber>()
-                            .ok()
-                    })
-                else {
-                    anyhow::bail!("found no blocknum in arguments");
-                };
-
-                Ok(KeyMaterial::Split(reltag, blocknum))
-            }
-        }
-    }
-}
-
-#[derive(Debug)]
-pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber);
-
-impl std::str::FromStr for SpanAttributesFromLogs {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // accept the span separator but do not require or fail if either is missing
-        // "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}"
-        let (_, reltag) = s
-            .split_once("rel=")
-            .ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?;
-        let reltag = reltag.split_whitespace().next().unwrap();
-
-        let (_, blocknum) = s
-            .split_once("blkno=")
-            .ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?;
-        let blocknum = blocknum.split_whitespace().next().unwrap();
-
-        let reltag = reltag
-            .parse()
-            .with_context(|| format!("parse reltag from {reltag:?}"))?;
-        let blocknum = blocknum
-            .parse()
-            .with_context(|| format!("parse blocknum from {blocknum:?}"))?;
-
-        Ok(Self(reltag, blocknum))
-    }
-}
-
-#[derive(Debug)]
-#[allow(dead_code)] // debug print is used
-enum RecognizedKeyKind {
-    DbDir,
-    ControlFile,
-    Checkpoint,
-    AuxFilesV1,
-    SlruDir(Result<SlruKind, u32>),
-    RelMap(RelTagish<2>),
-    RelDir(RelTagish<2>),
-    AuxFileV2(Result<AuxFileV2, utils::Hex<[u8; 16]>>),
-}
-
-#[derive(Debug, PartialEq)]
-#[allow(unused)]
-enum AuxFileV2 {
-    Recognized(&'static str, utils::Hex<[u8; 13]>),
-    OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>),
-    Other(utils::Hex<[u8; 13]>),
-}
-
-impl RecognizedKeyKind {
-    fn new(key: Key) -> Option<Self> {
-        use RecognizedKeyKind::{
-            AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir,
-        };
-
-        let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key);
-
-        Some(match key {
-            pageserver_api::key::DBDIR_KEY => DbDir,
-            pageserver_api::key::CONTROLFILE_KEY => ControlFile,
-            pageserver_api::key::CHECKPOINT_KEY => Checkpoint,
-            pageserver_api::key::AUX_FILES_KEY => AuxFilesV1,
-            _ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()),
-            _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => {
-                RelMap([key.field2, key.field3].into())
-            }
-            _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => {
-                RelDir([key.field2, key.field3].into())
-            }
-            _ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2(
-                AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())),
-            ),
-            _ => return None,
-        })
-    }
-}
-
-impl AuxFileV2 {
-    fn new(key: Key) -> Option<AuxFileV2> {
-        const EMPTY_HASH: [u8; 13] = {
-            let mut out = [0u8; 13];
-            let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes();
-            let mut i = 3;
-            while i < 16 {
-                out[i - 3] = hash[i];
-                i += 1;
-            }
-            out
-        };
-
-        let bytes = key.to_i128().to_be_bytes();
-        let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap());
-
-        assert_eq!(EMPTY_HASH.len(), hash.0.len());
-
-        // TODO: we could probably find the preimages for the hashes
-
-        Some(match (bytes[1], bytes[2]) {
-            (1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash),
-            (1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash),
-            (1, 3) if hash.0 == EMPTY_HASH => {
-                AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
-            }
-            (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
-            (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
-            (0xff, 0xff) => AuxFileV2::Other(hash),
-            _ => return None,
-        })
-    }
-}
-
-/// Prefix of RelTag, currently only known use cases are the two item versions.
-///
-/// Renders like a reltag with `/`, nothing else.
-struct RelTagish<const N: usize>([u32; N]);
-
-impl<const N: usize> From<[u32; N]> for RelTagish<N> {
-    fn from(val: [u32; N]) -> Self {
-        RelTagish(val)
-    }
-}
-
-impl<const N: usize> std::fmt::Debug for RelTagish<N> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        use std::fmt::Write as _;
-        let mut first = true;
-        self.0.iter().try_for_each(|x| {
-            if !first {
-                f.write_char('/')?;
-            }
-            first = false;
-            write!(f, "{}", x)
-        })
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use pageserver::aux_file::encode_aux_file_key;
-
-    use super::*;
-
-    #[test]
-    fn hex_is_key_material() {
-        let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap();
-        assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}");
-    }
-
-    #[test]
-    fn single_positional_spanalike_is_key_material() {
-        // why is this needed? if you are checking many, then copypaste starts to appeal
-        let strings = [
-            (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"),
-            (line!(), "rel=1663/208101/2620_fsm blkno=2"),
-            (line!(), "rel=1663/208101/2620.1 blkno=2"),
-        ];
-
-        let mut first: Option<Key> = None;
-
-        for (line, example) in strings {
-            let m = KeyMaterial::try_from(&[example][..])
-                .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
-            let key = Key::from(m);
-            if let Some(first) = first {
-                assert_eq!(first, key);
-            } else {
-                first = Some(key);
-            }
-        }
-
-        // not supporting this is rather accidential, but I think the input parsing is lenient
-        // enough already
-        KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err();
-    }
-
-    #[test]
-    fn multiple_spanlike_args() {
-        let strings = [
-            (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]),
-            (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]),
-            (line!(), &["1663/208101/2620_fsm", "2"][..]),
-        ];
-
-        let mut first: Option<Key> = None;
-
-        for (line, example) in strings {
-            let m = KeyMaterial::try_from(example)
-                .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
-            let key = Key::from(m);
-            if let Some(first) = first {
-                assert_eq!(first, key);
-            } else {
-                first = Some(key);
-            }
-        }
-    }
-    #[test]
-    fn recognized_auxfiles() {
-        use AuxFileV2::*;
-
-        let empty = [
-            0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d,
-        ];
-        let foobar = [
-            0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18,
-        ];
-
-        #[rustfmt::skip]
-        let examples = [
-            (line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))),
-            (line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))),
-            (line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))),
-            (line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))),
-            (line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))),
-            (line!(), "foobar", Other(utils::Hex(foobar))),
-        ];
-
-        for (line, path, expected) in examples {
-            let key = encode_aux_file_key(path);
-            let recognized =
-                AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed"));
-
-            assert_eq!(recognized, expected);
-        }
-
-        assert_eq!(
-            AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()),
-            None,
-            "example key has one too few 0 after 6 before 1"
-        );
-    }
-}
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -6,7 +6,6 @@

 mod draw_timeline_dir;
 mod index_part;
-mod key;
 mod layer_map_analyzer;
 mod layers;

@@ -62,8 +61,6 @@ enum Commands {
    AnalyzeLayerMap(AnalyzeLayerMapCmd),
    #[command(subcommand)]
    Layer(LayerCmd),
-    /// Debug print a hex key found from logs
-    Key(key::DescribeKeyCommand),
 }

 /// Read and update pageserver metadata file
@@ -186,7 +183,6 @@ async fn main() -> anyhow::Result<()> {
                .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
                .await?;
        }
-        Commands::Key(dkc) => dkc.execute(),
    };
    Ok(())
 }
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -5,7 +5,6 @@ use utils::lsn::Lsn;

 use std::collections::HashMap;
 use std::sync::Arc;
-use std::time::Instant;

 /// Ingest aux files into the pageserver.
 #[derive(clap::Parser)]
@@ -89,17 +88,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
        println!("ingested {file_cnt} files");
    }

-    for _ in 0..100 {
-        let start = Instant::now();
-        let files = mgmt_api_client
-            .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
-            .await?;
-        println!(
-            "{} files found in {}s",
-            files.len(),
-            start.elapsed().as_secs_f64()
-        );
-    }
+    let files = mgmt_api_client
+        .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
+        .await?;
+
+    println!("{} files found", files.len());

    anyhow::Ok(())
 }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,6 +1,6 @@
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver_api::key::Key;
+use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::PagestreamGetPageRequest;

@@ -187,7 +187,7 @@ async fn main_impl(
                    for r in partitioning.keys.ranges.iter() {
                        let mut i = r.start;
                        while i != r.end {
-                            if i.is_rel_block_key() {
+                            if is_rel_block_key(&i) {
                                filtered.add_key(i);
                            }
                            i = i.next();
@@ -308,10 +308,9 @@ async fn main_impl(
                    let r = &ranges[weights.sample(&mut rng)];
                    let key: i128 = rng.gen_range(r.start..r.end);
                    let key = Key::from_i128(key);
-                    assert!(key.is_rel_block_key());
-                    let (rel_tag, block_no) = key
-                        .to_rel_block()
-                        .expect("we filter non-rel-block keys out above");
+                    assert!(is_rel_block_key(&key));
+                    let (rel_tag, block_no) =
+                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
                    PagestreamGetPageRequest {
                        request_lsn: if rng.gen_bool(args.req_latest_probability) {
                            Lsn::MAX
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -178,8 +178,7 @@ impl AuxFileSizeEstimator {
        }
    }

-    /// When generating base backup or doing initial logical size calculation
-    pub fn on_initial(&self, new_size: usize) {
+    pub fn on_base_backup(&self, new_size: usize) {
        let mut guard = self.size.lock().unwrap();
        *guard = Some(new_size as isize);
        self.report(new_size as isize);
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::Key;
+use pageserver_api::key::{key_to_slru_block, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -170,7 +170,7 @@ where
    }

    async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
-        let (kind, segno, _) = key.to_slru_block()?;
+        let (kind, segno, _) = key_to_slru_block(*key)?;

        match kind {
            SlruKind::Clog => {
@@ -362,13 +362,6 @@ where
                    ));
                    info!("Replication slot {} restart LSN={}", path, restart_lsn);
                    min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
-                } else if path == "pg_logical/replorigin_checkpoint" {
-                    // replorigin_checkoint is written only on compute shutdown, so it contains
-                    // deteriorated values. So we generate our own version of this file for the particular LSN
-                    // based on information about replorigins extracted from transaction commit records.
-                    // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
-                    // but now we should handle (skip) it for backward compatibility.
-                    continue;
                }
                let header = new_tar_header(&path, content.len() as u64)?;
                self.ar
@@ -397,32 +390,6 @@ where
        {
            self.add_twophase_file(xid).await?;
        }
-        let repl_origins = self
-            .timeline
-            .get_replorigins(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
-        let n_origins = repl_origins.len();
-        if n_origins != 0 {
-            //
-            // Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins
-            // extracted from transaction commit record. We are using this file to pass information about replication
-            // origins to compute to allow logical replication to restart from proper point.
-            //
-            let mut content = Vec::with_capacity(n_origins * 16 + 8);
-            content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes());
-            for (origin_id, origin_lsn) in repl_origins {
-                content.extend_from_slice(&origin_id.to_le_bytes());
-                content.extend_from_slice(&[0u8; 6]); // align to 8 bytes
-                content.extend_from_slice(&origin_lsn.0.to_le_bytes());
-            }
-            let crc32 = crc32c::crc32c(&content);
-            content.extend_from_slice(&crc32.to_le_bytes());
-            let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
-            self.ar.append(&header, &*content).await.context(
-                "could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
-            )?;
-        }

        fail_point!("basebackup-before-control-file", |_| {
            Err(BasebackupError::Server(anyhow!(
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -99,6 +99,8 @@ pub mod defaults {

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

+    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
+
    ///
    /// Default built-in configuration file.
    ///
@@ -144,6 +146,8 @@ pub mod defaults {

 #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'

+#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -296,6 +300,8 @@ pub struct PageServerConf {
    ///
    /// Setting this to zero disables limits on total ephemeral layer size.
    pub ephemeral_bytes_per_memory_kb: usize,
+
+    pub walredo_process_kind: crate::walredo::ProcessKind,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -401,6 +407,8 @@ struct PageServerConfigBuilder {
    validate_vectored_get: BuilderValue<bool>,

    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
+
+    walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
 }

 impl PageServerConfigBuilder {
@@ -489,6 +497,8 @@ impl PageServerConfigBuilder {
            )),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
+
+            walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
        }
    }
 }
@@ -676,6 +686,10 @@ impl PageServerConfigBuilder {
        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
    }

+    pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
+        self.walredo_process_kind = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -733,6 +747,7 @@ impl PageServerConfigBuilder {
                max_vectored_read_bytes,
                validate_vectored_get,
                ephemeral_bytes_per_memory_kb,
+                walredo_process_kind,
            }
            CUSTOM LOGIC
            {
@@ -1029,6 +1044,9 @@ impl PageServerConf {
                "ephemeral_bytes_per_memory_kb" => {
                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
                }
+                "walredo_process_kind" => {
+                    builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1112,6 +1130,7 @@ impl PageServerConf {
            ),
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
        }
    }
 }
@@ -1351,6 +1370,7 @@ background_task_maximum_delay = '334 s'
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1424,6 +1444,7 @@ background_task_maximum_delay = '334 s'
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -2,9 +2,10 @@
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::size::CalculateSyntheticSizeError;
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
+use crate::tenant::{
+    mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
+};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -349,12 +350,19 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
    // Same for the loop that fetches computed metrics.
    // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
    // which turns out is really handy to understand the system.
-    match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await {
-        Ok(_) => {}
-        Err(CalculateSyntheticSizeError::Cancelled) => {}
-        Err(e) => {
-            let tenant_shard_id = tenant.tenant_shard_id();
-            error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
-        }
+    let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
+        return;
+    };
+
+    // This error can be returned if the timeline is shutting down, but it does not
+    // mean the synthetic size worker should terminate.
+    let shutting_down = matches!(
+        e.downcast_ref::<PageReconstructError>(),
+        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
+    );
+
+    if !shutting_down {
+        let tenant_shard_id = tenant.tenant_shard_id();
+        error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
    }
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -311,7 +311,7 @@ impl DeletionList {
                result.extend(
                    timeline_layers
                        .into_iter()
-                        .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
+                        .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
                );
            }
        }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -534,7 +534,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    });
                }
                EvictionLayer::Secondary(layer) => {
-                    let file_size = layer.metadata.file_size;
+                    let file_size = layer.metadata.file_size();

                    js.spawn(async move {
                        layer
@@ -641,7 +641,7 @@ impl EvictionLayer {
    pub(crate) fn get_file_size(&self) -> u64 {
        match self {
            Self::Attached(l) => l.layer_desc().file_size,
-            Self::Secondary(sl) => sl.metadata.file_size,
+            Self::Secondary(sl) => sl.metadata.file_size(),
        }
    }
 }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -81,10 +81,8 @@ paths:
        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
        404 means that deletion successfully finished"
      responses:
-        "200":
-          description: Tenant was successfully deleted, or was already not found.
        "404":
-          description: Tenant not found. This is a success result, equivalent to 200.
+          description: Tenant not found. This is the success path.
          content:
            application/json:
              schema:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -74,7 +74,6 @@ use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::CompactFlags;
-use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
 use crate::tenant::SpawnMode;
@@ -181,7 +180,12 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::MissingKey(e) => {
                ApiError::InternalServerError(anyhow::anyhow!("{e}"))
            }
-            PageReconstructError::Cancelled => ApiError::Cancelled,
+            PageReconstructError::Cancelled => {
+                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
+            }
+            PageReconstructError::AncestorStopping(_) => {
+                ApiError::ResourceUnavailable(format!("{pre}").into())
+            }
            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
@@ -1071,7 +1075,7 @@ async fn tenant_delete_handler(

    let state = get_state(&request);

-    let status = state
+    state
        .tenant_manager
        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
        .instrument(info_span!("tenant_delete_handler",
@@ -1080,14 +1084,7 @@ async fn tenant_delete_handler(
        ))
        .await?;

-    // Callers use 404 as success for deletions, for historical reasons.
-    if status == StatusCode::NOT_FOUND {
-        return Err(ApiError::NotFound(
-            anyhow::anyhow!("Deletion complete").into(),
-        ));
-    }
-
-    json_response(status, ())
+    json_response(StatusCode::ACCEPTED, ())
 }

 /// HTTP endpoint to query the current tenant_size of a tenant.
@@ -1135,10 +1132,7 @@ async fn tenant_size_handler(
            &ctx,
        )
        .await
-        .map_err(|e| match e {
-            crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown,
-            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
-        })?;
+        .map_err(ApiError::InternalServerError)?;

    let mut sizes = None;
    let accepts_html = headers
@@ -1146,7 +1140,9 @@ async fn tenant_size_handler(
        .map(|v| v == "text/html")
        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
-        let storage_model = inputs.calculate_model();
+        let storage_model = inputs
+            .calculate_model()
+            .map_err(ApiError::InternalServerError)?;
        let size = storage_model.calculate();

        // If request header expects html, return html
@@ -1817,22 +1813,11 @@ async fn timeline_checkpoint_handler(
        timeline
            .freeze_and_flush()
            .await
-            .map_err(|e| {
-                match e {
-                    tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
-                    other => ApiError::InternalServerError(other.into()),
-
-                }
-            })?;
+            .map_err(ApiError::InternalServerError)?;
        timeline
            .compact(&cancel, flags, &ctx)
            .await
-            .map_err(|e|
-                match e {
-                    CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                    CompactionError::Other(e) => ApiError::InternalServerError(e)
-                }
-            )?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        if wait_until_uploaded {
            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
@@ -2188,7 +2173,7 @@ async fn tenant_scan_remote_handler(
            {
                Ok((index_part, index_generation)) => {
                    tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
-                        index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn());
+                        index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn());
                    generation = std::cmp::max(generation, index_generation);
                }
                Err(DownloadError::NotFound) => {
@@ -2430,25 +2415,6 @@ async fn list_aux_files(
    json_response(StatusCode::OK, files)
 }

-async fn perf_info(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let state = get_state(&request);
-
-    let timeline =
-        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
-            .await?;
-
-    let result = timeline.perf_info().await;
-
-    json_response(StatusCode::OK, result)
-}
-
 async fn ingest_aux_files(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -2876,9 +2842,5 @@ pub fn make_router(
            |r| testing_api_handler("list_aux_files", r, list_aux_files),
        )
        .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
-            |r| testing_api_handler("perf_info", r, perf_info),
-        )
        .any(handler_404))
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2108,7 +2108,6 @@ pub(crate) struct TimelineMetrics {
    pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
    pub evictions: IntCounter,
    pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
-    shutdown: std::sync::atomic::AtomicBool,
 }

 impl TimelineMetrics {
@@ -2228,7 +2227,6 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
-            shutdown: std::sync::atomic::AtomicBool::default(),
        }
    }

@@ -2251,17 +2249,6 @@ impl TimelineMetrics {
    }

    pub(crate) fn shutdown(&self) {
-        let was_shutdown = self
-            .shutdown
-            .swap(true, std::sync::atomic::Ordering::Relaxed);
-
-        if was_shutdown {
-            // this happens on tenant deletion because tenant first shuts down timelines, then
-            // invokes timeline deletion which first shuts down the timeline again.
-            // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
-            return;
-        }
-
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -66,7 +66,6 @@ use crate::tenant::mgr::GetTenantError;
 use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
 use crate::tenant::mgr::TenantManager;
-use crate::tenant::timeline::FlushLayerError;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
@@ -261,8 +260,6 @@ async fn page_service_conn_main(
    socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
    let socket = std::pin::pin!(socket);

-    fail::fail_point!("ps::connection-start::pre-login");
-
    // XXX: pgbackend.run() should take the connection_ctx,
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
@@ -373,7 +370,7 @@ impl From<WaitLsnError> for PageStreamError {
        match value {
            e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
            WaitLsnError::Shutdown => Self::Shutdown,
-            e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()),
+            WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
        }
    }
 }
@@ -383,7 +380,7 @@ impl From<WaitLsnError> for QueryError {
        match value {
            e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
            WaitLsnError::Shutdown => Self::Shutdown,
-            WaitLsnError::BadState { .. } => Self::Reconnect,
+            WaitLsnError::BadState => Self::Reconnect,
        }
    }
 }
@@ -606,7 +603,6 @@ impl PageServerHandler {
            };

            trace!("query: {copy_data_bytes:?}");
-            fail::fail_point!("ps::handle-pagerequest-message");

            // Trace request if needed
            if let Some(t) = tracer.as_mut() {
@@ -621,7 +617,6 @@ impl PageServerHandler {

            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::exists");
                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
@@ -631,7 +626,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
@@ -641,7 +635,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetPage(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
                    // shard_id is filled in by the handler
                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
                    (
@@ -652,7 +645,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::DbSize(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
                    (
                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
@@ -662,7 +654,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetSlruSegment(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
@@ -831,10 +822,7 @@ impl PageServerHandler {
        // We only want to persist the data, and it doesn't matter if it's in the
        // shape of deltas or images.
        info!("flushing layers");
-        timeline.freeze_and_flush().await.map_err(|e| match e {
-            FlushLayerError::Cancelled => QueryError::Shutdown,
-            other => QueryError::Other(other.into()),
-        })?;
+        timeline.freeze_and_flush().await?;

        info!("done");
        Ok(())
@@ -1517,7 +1505,6 @@ where
        _pgb: &mut PostgresBackend<IO>,
        _sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
-        fail::fail_point!("ps::connection-start::startup-packet");
        Ok(())
    }

@@ -1532,8 +1519,6 @@ where
            Err(QueryError::SimulatedConnectionError)
        });

-        fail::fail_point!("ps::connection-start::process-query");
-
        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
        let parts = query_string.split_whitespace().collect::<Vec<_>>();
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -17,8 +17,8 @@ use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use itertools::Itertools;
 use pageserver_api::key::{
-    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
-    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
+    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
+    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
@@ -27,7 +27,7 @@ use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
-use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
+use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
@@ -36,7 +36,6 @@ use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
-use utils::pausable_failpoint;
 use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -79,19 +78,11 @@ pub enum LsnForTimestamp {
 }

 #[derive(Debug, thiserror::Error)]
-pub(crate) enum CalculateLogicalSizeError {
+pub enum CalculateLogicalSizeError {
    #[error("cancelled")]
    Cancelled,
-
-    /// Something went wrong while reading the metadata we use to calculate logical size
-    /// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`]
-    /// in the `From` implementation for this variant.
    #[error(transparent)]
-    PageRead(PageReconstructError),
-
-    /// Something went wrong deserializing metadata that we read to calculate logical size
-    #[error("decode error: {0}")]
-    Decode(#[from] DeserializeError),
+    Other(#[from] anyhow::Error),
 }

 #[derive(Debug, thiserror::Error)]
@@ -116,8 +107,10 @@ impl From<PageReconstructError> for CollectKeySpaceError {
 impl From<PageReconstructError> for CalculateLogicalSizeError {
    fn from(pre: PageReconstructError) -> Self {
        match pre {
-            PageReconstructError::Cancelled => Self::Cancelled,
-            _ => Self::PageRead(pre),
+            PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
+                Self::Cancelled
+            }
+            _ => Self::Other(pre.into()),
        }
    }
 }
@@ -410,8 +403,6 @@ impl Timeline {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<LsnForTimestamp, PageReconstructError> {
-        pausable_failpoint!("find-lsn-for-timestamp-pausable");
-
        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
        // We use this method to figure out the branching LSN for the new branch, but the
        // GC cutoff could be before the branching point and we cannot create a new branch
@@ -427,7 +418,6 @@ impl Timeline {

        let mut found_smaller = false;
        let mut found_larger = false;
-
        while low < high {
            if cancel.is_cancelled() {
                return Err(PageReconstructError::Cancelled);
@@ -722,22 +712,10 @@ impl Timeline {
                result.insert(fname, content);
            }
        }
-        self.aux_file_size_estimator.on_initial(sz);
+        self.aux_file_size_estimator.on_base_backup(sz);
        Ok(result)
    }

-    pub(crate) async fn trigger_aux_file_size_computation(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
-        let current_policy = self.last_aux_file_policy.load();
-        if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy {
-            self.list_aux_files_v2(lsn, ctx).await?;
-        }
-        Ok(())
-    }
-
    pub(crate) async fn list_aux_files(
        &self,
        lsn: Lsn,
@@ -776,27 +754,6 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_replorigins(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
-        let kv = self
-            .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
-            .await
-            .context("scan")?;
-        let mut result = HashMap::new();
-        for (k, v) in kv {
-            let v = v.context("get value")?;
-            let origin_id = k.field6 as RepOriginId;
-            let origin_lsn = Lsn::des(&v).unwrap();
-            if origin_lsn != Lsn::INVALID {
-                result.insert(origin_id, origin_lsn);
-            }
-        }
-        Ok(result)
-    }
-
    /// Does the same as get_current_logical_size but counted on demand.
    /// Used to initialize the logical size tracking on startup.
    ///
@@ -806,7 +763,7 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub(crate) async fn get_current_logical_size_non_incremental(
+    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -815,7 +772,7 @@ impl Timeline {

        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf)?;
+        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
@@ -919,20 +876,10 @@ impl Timeline {
            result.add_key(AUX_FILES_KEY);
        }

-        #[cfg(test)]
-        {
-            let guard = self.extra_test_dense_keyspace.load();
-            for kr in &guard.ranges {
-                result.add_range(kr.clone());
-            }
-        }
-
        Ok((
            result.to_keyspace(),
            /* AUX sparse key space */
-            SparseKeySpace(KeySpace {
-                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
-            }),
+            SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
        ))
    }

@@ -1201,20 +1148,6 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub async fn set_replorigin(
-        &mut self,
-        origin_id: RepOriginId,
-        origin_lsn: Lsn,
-    ) -> anyhow::Result<()> {
-        let key = repl_origin_key(origin_id);
-        self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
-        Ok(())
-    }
-
-    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
-        self.set_replorigin(origin_id, Lsn::INVALID).await
-    }
-
    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
        self.put(CONTROLFILE_KEY, Value::Image(img));
        Ok(())
@@ -1619,7 +1552,7 @@ impl<'a> DatadirModification<'a> {
                    self.tline.aux_file_size_estimator.on_add(content.len());
                    new_files.push((path, content));
                }
-                (None, true) => warn!("removing non-existing aux file: {}", path),
+                (None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
            }
            let new_val = aux_file::encode_file_value(&new_files)?;
            self.put(key, Value::Image(new_val.into()));
@@ -1673,7 +1606,8 @@ impl<'a> DatadirModification<'a> {
                        aux_files.dir = Some(dir);
                    }
                    Err(
-                        e @ (PageReconstructError::Cancelled
+                        e @ (PageReconstructError::AncestorStopping(_)
+                        | PageReconstructError::Cancelled
                        | PageReconstructError::AncestorLsnTimeout(_)),
                    ) => {
                        // Important that we do not interpret a shutdown error as "not found" and thereby
@@ -1745,7 +1679,7 @@ impl<'a> DatadirModification<'a> {
        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
        for (key, values) in self.pending_updates.drain() {
            for (lsn, value) in values {
-                if key.is_rel_block_key() || key.is_slru_block_key() {
+                if is_rel_block_key(&key) || is_slru_block_key(key) {
                    // This bails out on first error without modifying pending_updates.
                    // That's Ok, cf this function's doc comment.
                    writer.put(key, lsn, &value, ctx).await?;
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -8,7 +8,7 @@ use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, instrument, Instrument};

-use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
+use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};

 use crate::{
    config::PageServerConf,
@@ -16,7 +16,6 @@ use crate::{
    task_mgr::{self, TaskKind},
    tenant::{
        mgr::{TenantSlot, TenantsMapRemoveResult},
-        remote_timeline_client::remote_heatmap_path,
        timeline::ShutdownMode,
    },
 };
@@ -532,25 +531,6 @@ impl DeleteTenantFlow {
            }
        }

-        // Remove top-level tenant objects that don't belong to a timeline, such as heatmap
-        let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
-        if let Some(Err(e)) = backoff::retry(
-            || async {
-                remote_storage
-                    .delete(&heatmap_path, &task_mgr::shutdown_token())
-                    .await
-            },
-            TimeoutOrCancel::caused_by_cancel,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "remove_remote_tenant_heatmap",
-            &task_mgr::shutdown_token(),
-        )
-        .await
-        {
-            tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
-        }
-
        let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
        if timelines_path.exists() {
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,23 +1,15 @@
-//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
-//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
-//! this struct and it's original serialization format is still needed because they were written a
-//! long time ago.
+//! Every image of a certain timeline from [`crate::tenant::Tenant`]
+//! has metadata that needs to be stored persistently.
 //!
-//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
-//! versioning.
+//! Later, the file gets used in [`remote_timeline_client`] as a part of
+//! external storage import and export operations.
 //!
-//! To clean up this module we need to migrate all index_part.json files to a later version.
-//! While doing this, we need to be mindful about s3 based recovery as well, so it might take
-//! however long we keep the old versions to be able to delete the old code. After that, we can
-//! remove everything else than [`TimelineMetadataBodyV2`], rename it as `TimelineMetadata` and
-//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards
-//! compatibility.
+//! This module contains all structs and helper methods related to timeline metadata.
 //!
 //! [`remote_timeline_client`]: super::remote_timeline_client
-//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart

 use anyhow::ensure;
-use serde::{Deserialize, Serialize};
+use serde::{de::Error, Deserialize, Serialize, Serializer};
 use utils::bin_ser::SerializeError;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};

@@ -25,37 +17,17 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
 const METADATA_FORMAT_VERSION: u16 = 4;

 /// Previous supported format versions.
-///
-/// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming
-/// that requires a scrubber run which is yet to be done.
 const METADATA_OLD_FORMAT_VERSION: u16 = 3;

-/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic.
+/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
 ///
 /// This is the same assumption that PostgreSQL makes with the control file,
-///
 /// see PG_CONTROL_MAX_SAFE_SIZE
 const METADATA_MAX_SIZE: usize = 512;

-/// Legacy metadata stored as a component of `index_part.json` per timeline.
+/// Metadata stored on disk for each timeline
 ///
-/// Do not make new changes to this type or the module. In production, we have two different kinds
-/// of serializations of this type: bincode and json. Bincode version reflects what used to be
-/// stored on disk in earlier versions and does internal crc32 checksumming.
-///
-/// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would
-/// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern
-/// as-exists in `index_part.json` ([`self::modern_serde`]).
-///
-/// ```compile_fail
-/// #[derive(serde::Serialize)]
-/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata);
-/// ```
-///
-/// ```compile_fail
-/// #[derive(serde::Deserialize)]
-/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata);
-/// ```
+/// The fields correspond to the values we hold in memory, in Timeline.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct TimelineMetadata {
    hdr: TimelineMetadataHeader,
@@ -68,49 +40,6 @@ struct TimelineMetadataHeader {
    size: u16,           // size of serialized metadata
    format_version: u16, // metadata format version (used for compatibility checks)
 }
-
-impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
-    type Error = Crc32CalculationFailed;
-
-    fn try_from(value: &TimelineMetadataBodyV2) -> Result<Self, Self::Error> {
-        #[derive(Default)]
-        struct Crc32Sink {
-            crc: u32,
-            count: usize,
-        }
-
-        impl std::io::Write for Crc32Sink {
-            fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
-                self.crc = crc32c::crc32c_append(self.crc, buf);
-                self.count += buf.len();
-                Ok(buf.len())
-            }
-
-            fn flush(&mut self) -> std::io::Result<()> {
-                Ok(())
-            }
-        }
-
-        // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
-        // across serialization versions
-        let mut sink = Crc32Sink::default();
-        <TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(value, &mut sink)
-            .map_err(Crc32CalculationFailed)?;
-
-        let size = METADATA_HDR_SIZE + sink.count;
-
-        Ok(TimelineMetadataHeader {
-            checksum: sink.crc,
-            size: size as u16,
-            format_version: METADATA_FORMAT_VERSION,
-        })
-    }
-}
-
-#[derive(thiserror::Error, Debug)]
-#[error("re-serializing for crc32 failed")]
-struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);
-
 const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -182,12 +111,6 @@ impl TimelineMetadata {
        }
    }

-    #[cfg(test)]
-    pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result<Self> {
-        self.hdr = TimelineMetadataHeader::try_from(&self.body)?;
-        Ok(self)
-    }
-
    fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
        let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;

@@ -338,93 +261,25 @@ impl TimelineMetadata {
    }
 }

-pub(crate) mod modern_serde {
-    use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader};
-    use serde::{Deserialize, Serialize};
-
-    pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
+impl<'de> Deserialize<'de> for TimelineMetadata {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
-        D: serde::de::Deserializer<'de>,
+        D: serde::Deserializer<'de>,
    {
-        // for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec<u8> field with
-        // BeSer.
-        struct Visitor;
-
-        impl<'d> serde::de::Visitor<'d> for Visitor {
-            type Value = TimelineMetadata;
-
-            fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-                f.write_str("BeSer bytes or json structure")
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'d>,
-            {
-                use serde::de::Error;
-                let de = serde::de::value::SeqAccessDeserializer::new(seq);
-                Vec::<u8>::deserialize(de)
-                    .map(|v| TimelineMetadata::from_bytes(&v).map_err(A::Error::custom))?
-            }
-
-            fn visit_map<A>(self, map: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::MapAccess<'d>,
-            {
-                use serde::de::Error;
-
-                let de = serde::de::value::MapAccessDeserializer::new(map);
-                let body = TimelineMetadataBodyV2::deserialize(de)?;
-                let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?;
-
-                Ok(TimelineMetadata { hdr, body })
-            }
-        }
-
-        deserializer.deserialize_any(Visitor)
+        let bytes = Vec::<u8>::deserialize(deserializer)?;
+        Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
    }
+}

-    pub(crate) fn serialize<S>(
-        metadata: &TimelineMetadata,
-        serializer: S,
-    ) -> Result<S::Ok, S::Error>
+impl Serialize for TimelineMetadata {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
-        S: serde::Serializer,
+        S: Serializer,
    {
-        // header is not needed, upon reading we've upgraded all v1 to v2
-        metadata.body.serialize(serializer)
-    }
-
-    #[test]
-    fn deserializes_bytes_as_well_as_equivalent_body_v2() {
-        #[derive(serde::Deserialize, serde::Serialize)]
-        struct Wrapper(
-            #[serde(deserialize_with = "deserialize", serialize_with = "serialize")]
-            TimelineMetadata,
-        );
-
-        let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";
-
-        let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();
-
-        let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap();
-
-        assert_eq!(
-            serialized,
-            serde_json::json! {{
-                "disk_consistent_lsn": "0/149FD90",
-                "prev_record_lsn": "0/149FD18",
-                "ancestor_timeline": null,
-                "ancestor_lsn": "0/0",
-                "latest_gc_cutoff_lsn": "0/149FD18",
-                "initdb_lsn": "0/149FD18",
-                "pg_version": 15
-            }}
-        );
-
-        let wrapper_from_json = serde_json::value::from_value::<Wrapper>(serialized).unwrap();
-
-        assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0);
+        let bytes = self
+            .to_bytes()
+            .map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
+        bytes.serialize(serializer)
    }
 }

@@ -548,6 +403,59 @@ mod tests {
        );
    }

+    #[test]
+    fn test_metadata_bincode_serde() {
+        let original_metadata = TimelineMetadata::new(
+            Lsn(0x200),
+            Some(Lsn(0x100)),
+            Some(TIMELINE_ID),
+            Lsn(0),
+            Lsn(0),
+            Lsn(0),
+            // Any version will do here, so use the default
+            crate::DEFAULT_PG_VERSION,
+        );
+        let metadata_bytes = original_metadata
+            .to_bytes()
+            .expect("Cannot create bytes array from metadata");
+
+        let metadata_bincode_be_bytes = original_metadata
+            .ser()
+            .expect("Cannot serialize the metadata");
+
+        // 8 bytes for the length of the vector
+        assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
+
+        let expected_bincode_bytes = {
+            let mut temp = vec![];
+            let len_bytes = metadata_bytes.len().to_be_bytes();
+            temp.extend_from_slice(&len_bytes);
+            temp.extend_from_slice(&metadata_bytes);
+            temp
+        };
+        assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
+
+        let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
+        // Deserialized metadata has the metadata header, which is different from the serialized one.
+        //   Reference: TimelineMetadata::to_bytes()
+        let expected_metadata = {
+            let mut temp_metadata = original_metadata;
+            let body_bytes = temp_metadata
+                .body
+                .ser()
+                .expect("Cannot serialize the metadata body");
+            let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
+            let hdr = TimelineMetadataHeader {
+                size: metadata_size as u16,
+                format_version: METADATA_FORMAT_VERSION,
+                checksum: crc32c::crc32c(&body_bytes),
+            };
+            temp_metadata.hdr = hdr;
+            temp_metadata
+        };
+        assert_eq!(deserialized_metadata, expected_metadata);
+    }
+
    #[test]
    fn test_metadata_bincode_serde_ensure_roundtrip() {
        let original_metadata = TimelineMetadata::new(
@@ -561,6 +469,8 @@ mod tests {
            crate::DEFAULT_PG_VERSION,
        );
        let expected_bytes = vec![
+            /* bincode length encoding bytes */
+            0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
            /* TimelineMetadataHeader */
            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
            /* TimelineMetadataBodyV2 */
@@ -590,7 +500,7 @@ mod tests {
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0,
        ];
-        let metadata_ser_bytes = original_metadata.to_bytes().unwrap();
+        let metadata_ser_bytes = original_metadata.ser().unwrap();
        assert_eq!(metadata_ser_bytes, expected_bytes);

        let expected_metadata = {
@@ -608,7 +518,7 @@ mod tests {
            temp_metadata.hdr = hdr;
            temp_metadata
        };
-        let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap();
+        let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
        assert_eq!(des_metadata, expected_metadata);
    }
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -3,7 +3,6 @@

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
-use hyper::StatusCode;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
@@ -46,7 +45,7 @@ use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
-use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
+use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -55,7 +54,6 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
-use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
 use super::timeline::detach_ancestor::PreparedTimelineDetach;
 use super::TenantSharedResources;
@@ -1371,7 +1369,7 @@ impl TenantManager {
        &self,
        tenant_shard_id: TenantShardId,
        activation_timeout: Duration,
-    ) -> Result<StatusCode, DeleteTenantError> {
+    ) -> Result<(), DeleteTenantError> {
        super::span::debug_assert_current_span_has_tenant_id();
        // We acquire a SlotGuard during this function to protect against concurrent
        // changes while the ::prepare phase of DeleteTenantFlow executes, but then
@@ -1384,79 +1382,18 @@ impl TenantManager {
        //
        // See https://github.com/neondatabase/neon/issues/5080

-        // Tenant deletion can happen two ways:
-        // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
-        //   state until deletion is complete.
-        // - New: called on a pageserver without an attached location.  We proceed with deletion from
-        //   remote storage.
-        //
-        // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
+        let slot_guard =
+            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

-        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        match &slot_guard.old_value {
-            Some(TenantSlot::Attached(tenant)) => {
-                // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
-                // deletion will be resumed across restarts.
-                let tenant = tenant.clone();
-                return self
-                    .delete_tenant_attached(slot_guard, tenant, activation_timeout)
-                    .await;
+        // unwrap is safe because we used MustExist mode when acquiring
+        let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
+            TenantSlot::Attached(tenant) => tenant.clone(),
+            _ => {
+                // Express "not attached" as equivalent to "not found"
+                return Err(DeleteTenantError::NotAttached);
            }
-            Some(TenantSlot::Secondary(secondary_tenant)) => {
-                secondary_tenant.shutdown().await;
-                let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
-                let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
-                    .await
-                    .with_context(|| {
-                        format!("local tenant directory {local_tenant_directory:?} rename")
-                    })?;
-                spawn_background_purge(tmp_dir);
-            }
-            Some(TenantSlot::InProgress(_)) => unreachable!(),
-            None => {}
        };

-        // Fall through: local state for this tenant is no longer present, proceed with remote delete
-        let remote_path = remote_tenant_path(&tenant_shard_id);
-        let keys = match self
-            .resources
-            .remote_storage
-            .list(
-                Some(&remote_path),
-                remote_storage::ListingMode::NoDelimiter,
-                None,
-                &self.cancel,
-            )
-            .await
-        {
-            Ok(listing) => listing.keys,
-            Err(remote_storage::DownloadError::Cancelled) => {
-                return Err(DeleteTenantError::Cancelled)
-            }
-            Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
-            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
-        };
-
-        if keys.is_empty() {
-            tracing::info!("Remote storage already deleted");
-        } else {
-            tracing::info!("Deleting {} keys from remote storage", keys.len());
-            self.resources
-                .remote_storage
-                .delete_objects(&keys, &self.cancel)
-                .await?;
-        }
-
-        // Callers use 404 as success for deletions, for historical reasons.
-        Ok(StatusCode::NOT_FOUND)
-    }
-
-    async fn delete_tenant_attached(
-        &self,
-        slot_guard: SlotGuard,
-        tenant: Arc<Tenant>,
-        activation_timeout: Duration,
-    ) -> Result<StatusCode, DeleteTenantError> {
        match tenant.current_state() {
            TenantState::Broken { .. } | TenantState::Stopping { .. } => {
                // If deletion is already in progress, return success (the semantics of this
@@ -1466,7 +1403,7 @@ impl TenantManager {
                    // The `delete_progress` lock is held: deletion is already happening
                    // in the bacckground
                    slot_guard.revert();
-                    return Ok(StatusCode::ACCEPTED);
+                    return Ok(());
                }
            }
            _ => {
@@ -1499,8 +1436,7 @@ impl TenantManager {

        // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
        slot_guard.revert();
-        let () = result?;
-        Ok(StatusCode::ACCEPTED)
+        result
    }

    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
@@ -2897,13 +2833,7 @@ pub(crate) async fn immediate_gc(
        }
    }

-    result.map_err(|e| match e {
-        GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
-        GcError::TimelineNotFound => {
-            ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
-        }
-        other => ApiError::InternalServerError(anyhow::anyhow!(other)),
-    })
+    result.map_err(ApiError::InternalServerError)
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -91,7 +91,8 @@
 //!
 //! The *actual* remote state lags behind the *desired* remote state while
 //! there are in-flight operations.
-//! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`].
+//! We keep track of the desired remote state in
+//! [`UploadQueueInitialized::latest_files`] and [`UploadQueueInitialized::latest_metadata`].
 //! It is initialized based on the [`IndexPart`] that was passed during init
 //! and updated with every `schedule_*` function call.
 //! All this is necessary necessary to compute the future [`IndexPart`]s
@@ -114,7 +115,8 @@
 //!
 //! # Completion
 //!
-//! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately,
+//! Once an operation has completed, we update
+//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
 //! and submit a request through the DeletionQueue to update
 //! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
 //! validated that our generation is not stale.  It is this visible value
@@ -195,7 +197,6 @@ pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
-use utils::pausable_failpoint;

 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
@@ -414,7 +415,6 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise.
    pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
        match &mut *self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
@@ -441,11 +441,13 @@ impl RemoteTimelineClient {
    /// Returns true if this timeline was previously detached at this Lsn and the remote timeline
    /// client is currently initialized.
    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
+        // technically this is a dirty read, but given how timeline detach ancestor is implemented
+        // via tenant restart, the lineage has always been uploaded.
        self.upload_queue
            .lock()
            .unwrap()
            .initialized_mut()
-            .map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn))
+            .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
            .unwrap_or(false)
    }

@@ -454,6 +456,7 @@ impl RemoteTimelineClient {
            current_remote_index_part
                .layer_metadata
                .values()
+                // If we don't have the file size for the layer, don't account for it in the metric.
                .map(|ilmd| ilmd.file_size)
                .sum()
        } else {
@@ -581,9 +584,9 @@ impl RemoteTimelineClient {

        // As documented in the struct definition, it's ok for latest_metadata to be
        // ahead of what's _actually_ on the remote during index upload.
-        upload_queue.dirty.metadata = metadata.clone();
+        upload_queue.latest_metadata = metadata.clone();

-        self.schedule_index_upload(upload_queue)?;
+        self.schedule_index_upload(upload_queue);

        Ok(())
    }
@@ -602,9 +605,9 @@ impl RemoteTimelineClient {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        upload_queue.dirty.metadata.apply(update);
+        upload_queue.latest_metadata.apply(update);

-        self.schedule_index_upload(upload_queue)?;
+        self.schedule_index_upload(upload_queue);

        Ok(())
    }
@@ -616,8 +619,8 @@ impl RemoteTimelineClient {
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;
-        upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
-        self.schedule_index_upload(upload_queue)?;
+        upload_queue.last_aux_file_policy = last_aux_file_policy;
+        self.schedule_index_upload(upload_queue);
        Ok(())
    }
    ///
@@ -635,44 +638,30 @@ impl RemoteTimelineClient {
        let upload_queue = guard.initialized_mut()?;

        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);
        }

        Ok(())
    }

    /// Launch an index-file upload operation in the background (internal function)
-    fn schedule_index_upload(
-        self: &Arc<Self>,
-        upload_queue: &mut UploadQueueInitialized,
-    ) -> anyhow::Result<()> {
-        let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
-        // fix up the duplicated field
-        upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn;
-
-        // make sure it serializes before doing it in perform_upload_task so that it doesn't
-        // look like a retryable error
-        let void = std::io::sink();
-        serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?;
-
-        let index_part = &upload_queue.dirty;
+    fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
+        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();

        info!(
            "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
-            index_part.layer_metadata.len(),
+            upload_queue.latest_files.len(),
            upload_queue.latest_files_changes_since_metadata_upload_scheduled,
        );

-        let op = UploadOp::UploadMetadata {
-            uploaded: Box::new(index_part.clone()),
-        };
+        let index_part = IndexPart::from(&*upload_queue);
+        let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;

        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
-        Ok(())
    }

    pub(crate) async fn schedule_reparenting_and_wait(
@@ -685,16 +674,16 @@ impl RemoteTimelineClient {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else {
+            let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
                return Err(anyhow::anyhow!(
                    "cannot reparent without a current ancestor"
                ));
            };

-            upload_queue.dirty.metadata.reparent(new_parent);
-            upload_queue.dirty.lineage.record_previous_ancestor(&prev);
+            upload_queue.latest_metadata.reparent(new_parent);
+            upload_queue.latest_lineage.record_previous_ancestor(&prev);

-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);

            self.schedule_barrier0(upload_queue)
        };
@@ -715,17 +704,16 @@ impl RemoteTimelineClient {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
-            upload_queue.dirty.lineage.record_detaching(&adopted);
+            upload_queue.latest_metadata.detach_from_ancestor(&adopted);
+            upload_queue.latest_lineage.record_detaching(&adopted);

            for layer in layers {
                upload_queue
-                    .dirty
-                    .layer_metadata
+                    .latest_files
                    .insert(layer.layer_desc().layer_name(), layer.metadata());
            }

-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);

            let barrier = self.schedule_barrier0(upload_queue);
            self.launch_queued_tasks(upload_queue);
@@ -757,8 +745,7 @@ impl RemoteTimelineClient {
        let metadata = layer.metadata();

        upload_queue
-            .dirty
-            .layer_metadata
+            .latest_files
            .insert(layer.layer_desc().layer_name(), metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

@@ -788,8 +775,8 @@ impl RemoteTimelineClient {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        let with_metadata = self
-            .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?;
+        let with_metadata =
+            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());

        self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);

@@ -813,7 +800,7 @@ impl RemoteTimelineClient {

        let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());

-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);

        self.launch_queued_tasks(upload_queue);

@@ -826,7 +813,7 @@ impl RemoteTimelineClient {
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
        names: I,
-    ) -> anyhow::Result<Vec<(LayerName, LayerFileMetadata)>>
+    ) -> Vec<(LayerName, LayerFileMetadata)>
    where
        I: IntoIterator<Item = LayerName>,
    {
@@ -836,7 +823,7 @@ impl RemoteTimelineClient {
        let with_metadata: Vec<_> = names
            .into_iter()
            .filter_map(|name| {
-                let meta = upload_queue.dirty.layer_metadata.remove(&name);
+                let meta = upload_queue.latest_files.remove(&name);

                if let Some(meta) = meta {
                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -868,10 +855,10 @@ impl RemoteTimelineClient {
        // index_part update, because that needs to be uploaded before we can actually delete the
        // files.
        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);
        }

-        Ok(with_metadata)
+        with_metadata
    }

    /// Schedules deletion for layer files which have previously been unlinked from the
@@ -962,7 +949,7 @@ impl RemoteTimelineClient {

        let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());

-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
        self.launch_queued_tasks(upload_queue);

        Ok(())
@@ -1097,7 +1084,7 @@ impl RemoteTimelineClient {
            let deleted_at = Utc::now().naive_utc();
            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);

-            let mut index_part = stopped.upload_queue_for_deletion.dirty.clone();
+            let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
            index_part.deleted_at = Some(deleted_at);
            index_part
        };
@@ -1205,7 +1192,7 @@ impl RemoteTimelineClient {
                    &self.storage_impl,
                    uploaded.local_path(),
                    &remote_path,
-                    uploaded.metadata().file_size,
+                    uploaded.metadata().file_size(),
                    cancel,
                )
                .await
@@ -1308,8 +1295,7 @@ impl RemoteTimelineClient {

            stopped
                .upload_queue_for_deletion
-                .dirty
-                .layer_metadata
+                .latest_files
                .drain()
                .map(|(file_name, meta)| {
                    remote_layer_path(
@@ -1446,7 +1432,7 @@ impl RemoteTimelineClient {
                    // Can always be scheduled.
                    true
                }
-                UploadOp::UploadMetadata { .. } => {
+                UploadOp::UploadMetadata(_, _) => {
                    // These can only be performed after all the preceding operations
                    // have finished.
                    upload_queue.inprogress_tasks.is_empty()
@@ -1488,7 +1474,7 @@ impl RemoteTimelineClient {
                UploadOp::UploadLayer(_, _) => {
                    upload_queue.num_inprogress_layer_uploads += 1;
                }
-                UploadOp::UploadMetadata { .. } => {
+                UploadOp::UploadMetadata(_, _) => {
                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
                UploadOp::Delete(_) => {
@@ -1587,7 +1573,7 @@ impl RemoteTimelineClient {
                        &self.storage_impl,
                        local_path,
                        &remote_path,
-                        layer_metadata.file_size,
+                        layer_metadata.file_size(),
                        &self.cancel,
                    )
                    .measure_remote_op(
@@ -1597,13 +1583,22 @@ impl RemoteTimelineClient {
                    )
                    .await
                }
-                UploadOp::UploadMetadata { ref uploaded } => {
+                UploadOp::UploadMetadata(ref index_part, _lsn) => {
+                    let mention_having_future_layers = if cfg!(feature = "testing") {
+                        index_part
+                            .layer_metadata
+                            .keys()
+                            .any(|x| x.is_in_future(*_lsn))
+                    } else {
+                        false
+                    };
+
                    let res = upload::upload_index_part(
                        &self.storage_impl,
                        &self.tenant_shard_id,
                        &self.timeline_id,
                        self.generation,
-                        uploaded,
+                        index_part,
                        &self.cancel,
                    )
                    .measure_remote_op(
@@ -1613,21 +1608,10 @@ impl RemoteTimelineClient {
                    )
                    .await;
                    if res.is_ok() {
-                        self.update_remote_physical_size_gauge(Some(uploaded));
-                        let mention_having_future_layers = if cfg!(feature = "testing") {
-                            uploaded
-                                .layer_metadata
-                                .keys()
-                                .any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn()))
-                        } else {
-                            false
-                        };
+                        self.update_remote_physical_size_gauge(Some(index_part));
                        if mention_having_future_layers {
                            // find rationale near crate::tenant::timeline::init::cleanup_future_layer
-                            tracing::info!(
-                                disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(),
-                                "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup"
-                            );
+                            tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
                        }
                    }
                    res
@@ -1728,23 +1712,11 @@ impl RemoteTimelineClient {
                    upload_queue.num_inprogress_layer_uploads -= 1;
                    None
                }
-                UploadOp::UploadMetadata { ref uploaded } => {
+                UploadOp::UploadMetadata(_, lsn) => {
                    upload_queue.num_inprogress_metadata_uploads -= 1;
+                    // XXX monotonicity check?

-                    // the task id is reused as a monotonicity check for storing the "clean"
-                    // IndexPart.
-                    let last_updater = upload_queue.clean.1;
-                    let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id);
-                    let monotone = is_later || last_updater.is_none();
-
-                    assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id);
-
-                    // not taking ownership is wasteful
-                    upload_queue.clean.0.clone_from(uploaded);
-                    upload_queue.clean.1 = Some(task.task_id);
-
-                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
-
+                    upload_queue.projected_remote_consistent_lsn = Some(lsn);
                    if self.generation.is_none() {
                        // Legacy mode: skip validating generation
                        upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -1796,9 +1768,9 @@ impl RemoteTimelineClient {
            UploadOp::UploadLayer(_, m) => (
                RemoteOpFileKind::Layer,
                RemoteOpKind::Upload,
-                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
            ),
-            UploadOp::UploadMetadata { .. } => (
+            UploadOp::UploadMetadata(_, _) => (
                RemoteOpFileKind::Index,
                RemoteOpKind::Upload,
                DontTrackSize {
@@ -1874,9 +1846,11 @@ impl RemoteTimelineClient {
                    // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
                    let upload_queue_for_deletion = UploadQueueInitialized {
                        task_counter: 0,
-                        dirty: initialized.dirty.clone(),
-                        clean: initialized.clean.clone(),
+                        latest_files: initialized.latest_files.clone(),
                        latest_files_changes_since_metadata_upload_scheduled: 0,
+                        latest_metadata: initialized.latest_metadata.clone(),
+                        latest_lineage: initialized.latest_lineage.clone(),
+                        projected_remote_consistent_lsn: None,
                        visible_remote_consistent_lsn: initialized
                            .visible_remote_consistent_lsn
                            .clone(),
@@ -1889,6 +1863,7 @@ impl RemoteTimelineClient {
                        dangling_files: HashMap::default(),
                        shutting_down: false,
                        shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+                        last_aux_file_policy: initialized.last_aux_file_policy,
                    };

                    let upload_queue = std::mem::replace(
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -28,7 +28,6 @@ use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
-use utils::pausable_failpoint;

 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
@@ -85,7 +84,7 @@ pub async fn download_layer_file<'a>(
    )
    .await?;

-    let expected = layer_metadata.file_size;
+    let expected = layer_metadata.file_size();
    if expected != bytes_amount {
        return Err(DownloadError::Other(anyhow!(
            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
@@ -153,8 +152,6 @@ async fn download_object<'a>(

                let download = storage.download(src_path, cancel).await?;

-                pausable_failpoint!("before-downloading-layer-stream-pausable");
-
                let mut buf_writer =
                    tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);

@@ -202,8 +199,6 @@ async fn download_object<'a>(

                let mut download = storage.download(src_path, cancel).await?;

-                pausable_failpoint!("before-downloading-layer-stream-pausable");
-
                // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
                // There's chunks_vectored() on the stream.
                let (bytes_amount, destination_file) = async {
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -11,11 +11,52 @@ use utils::id::TimelineId;

 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerName;
+use crate::tenant::upload_queue::UploadQueueInitialized;
 use crate::tenant::Generation;
 use pageserver_api::shard::ShardIndex;

 use utils::lsn::Lsn;

+/// Metadata gathered for each of the layer files.
+///
+/// This is the in-memory form; [`IndexLayerMetadata`] is the form serialized into an
+/// [`IndexPart`]. The `From` impls between the two types copy `file_size`, `generation`
+/// and `shard` one to one.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub struct LayerFileMetadata {
+    /// Size of the layer file in bytes.
+    file_size: u64,
+
+    pub(crate) generation: Generation,
+
+    pub(crate) shard: ShardIndex,
+}
+
+impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
+    /// Copies every field of the serialized form into its in-memory counterpart.
+    fn from(stored: &IndexLayerMetadata) -> Self {
+        Self::new(stored.file_size, stored.generation, stored.shard)
+    }
+}
+
+impl LayerFileMetadata {
+    /// Creates the metadata of a layer file from its size, generation and shard.
+    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
+        Self {
+            file_size,
+            generation,
+            shard,
+        }
+    }
+
+    /// Returns the size of the layer file in bytes.
+    pub fn file_size(&self) -> u64 {
+        self.file_size
+    }
+}
+
+// TODO seems like another part of the remote storage file format
+// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
 /// In-memory representation of an `index_part.json` file
 ///
 /// Contains the data about all files in the timeline, present remotely and its metadata.
@@ -36,21 +77,14 @@ pub struct IndexPart {
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
    /// that latest version stores.
-    pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
+    pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,

-    /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the
-    /// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be
-    /// reused.
-    pub(super) disk_consistent_lsn: Lsn,
+    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
+    // It's duplicated for convenience when reading the serialized structure, but is
+    // private because internally we would read from metadata instead.
+    disk_consistent_lsn: Lsn,

-    // TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version Adding
-    // the "alias = metadata" was forgotten in #7693, so we have to use "rewrite = metadata_bytes"
-    // for backwards compatibility.
-    #[serde(
-        rename = "metadata_bytes",
-        alias = "metadata",
-        with = "crate::tenant::metadata::modern_serde"
-    )]
+    #[serde(rename = "metadata_bytes")]
    pub metadata: TimelineMetadata,

    #[serde(default)]
@@ -79,33 +113,43 @@ impl IndexPart {
    /// - 4: timeline_layers is fully removed.
    /// - 5: lineage was added
    /// - 6: last_aux_file_policy is added.
-    /// - 7: metadata_bytes is no longer written, but still read
-    const LATEST_VERSION: usize = 7;
+    const LATEST_VERSION: usize = 6;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];

    pub const FILE_NAME: &'static str = "index_part.json";

-    pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
-        IndexPart {
+    fn new(
+        layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
+        disk_consistent_lsn: Lsn,
+        metadata: TimelineMetadata,
+        lineage: Lineage,
+        last_aux_file_policy: Option<AuxFilePolicy>,
+    ) -> Self {
+        let layer_metadata = layers_and_metadata
+            .iter()
+            .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
+            .collect();
+
+        Self {
            version: Self::LATEST_VERSION,
-            layer_metadata: Default::default(),
-            disk_consistent_lsn: metadata.disk_consistent_lsn(),
+            layer_metadata,
+            disk_consistent_lsn,
            metadata,
            deleted_at: None,
-            lineage: Default::default(),
-            last_aux_file_policy: None,
+            lineage,
+            last_aux_file_policy,
        }
    }

-    pub fn version(&self) -> usize {
+    pub fn get_version(&self) -> usize {
        self.version
    }

    /// If you want this under normal operations, read it from self.metadata:
    /// this method is just for the scrubber to use when validating an index.
-    pub fn duplicated_disk_consistent_lsn(&self) -> Lsn {
+    pub fn get_disk_consistent_lsn(&self) -> Lsn {
        self.disk_consistent_lsn
    }

@@ -119,7 +163,14 @@ impl IndexPart {

    #[cfg(test)]
    pub(crate) fn example() -> Self {
-        Self::empty(TimelineMetadata::example())
+        let example_metadata = TimelineMetadata::example();
+        Self::new(
+            &HashMap::new(),
+            example_metadata.disk_consistent_lsn(),
+            example_metadata,
+            Default::default(),
+            Some(AuxFilePolicy::V1),
+        )
    }

    pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
@@ -127,12 +178,25 @@ impl IndexPart {
    }
 }

-/// Metadata gathered for each of the layer files.
-///
-/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
-/// might have less or more metadata depending if upgrading or rolling back an upgrade.
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
-pub struct LayerFileMetadata {
+impl From<&UploadQueueInitialized> for IndexPart {
+    /// Builds the index from the queue's latest files, metadata, lineage and aux file policy.
+    ///
+    /// The separately stored `disk_consistent_lsn` is read from that same latest metadata.
+    fn from(queue: &UploadQueueInitialized) -> Self {
+        let latest_metadata = &queue.latest_metadata;
+        Self::new(
+            &queue.latest_files,
+            latest_metadata.disk_consistent_lsn(),
+            latest_metadata.clone(),
+            queue.latest_lineage.clone(),
+            queue.last_aux_file_policy,
+        )
+    }
+}
+
+/// Serialized form of [`LayerFileMetadata`].
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+pub struct IndexLayerMetadata {
    pub file_size: u64,

    #[serde(default = "Generation::none")]
@@ -144,12 +208,12 @@ pub struct LayerFileMetadata {
    pub shard: ShardIndex,
 }

-impl LayerFileMetadata {
-    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
-        LayerFileMetadata {
-            file_size,
-            generation,
-            shard,
+impl From<&LayerFileMetadata> for IndexLayerMetadata {
+    fn from(other: &LayerFileMetadata) -> Self {
+        IndexLayerMetadata {
+            file_size: other.file_size,
+            generation: other.generation,
+            shard: other.shard,
        }
    }
 }
@@ -212,18 +276,19 @@ impl Lineage {
    /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
    /// to start a read/write primary at this lsn".
    ///
-    /// Returns true if the Lsn was previously our branch point.
+    /// Returns true if the Lsn was previously a branch point.
    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
        self.original_ancestor
-            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
+            .as_ref()
+            .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
    }
 }

 #[cfg(test)]
 mod tests {
-    use super::*;
    use std::str::FromStr;
-    use utils::id::TimelineId;
+
+    use super::*;

    #[test]
    fn v1_indexpart_is_parsed() {
@@ -242,12 +307,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -284,12 +349,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -327,12 +392,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 2,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -342,7 +407,8 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
+            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
            lineage: Lineage::default(),
            last_aux_file_policy: None,
        };
@@ -414,12 +480,12 @@ mod tests {
        let expected = IndexPart {
            version: 4,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -456,12 +522,12 @@ mod tests {
        let expected = IndexPart {
            version: 5,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
                    file_size: 23289856,
                    generation: Generation::new(1),
                    shard: ShardIndex::unsharded(),
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
                    file_size: 1015808,
                    generation: Generation::new(1),
                    shard: ShardIndex::unsharded(),
@@ -503,12 +569,12 @@ mod tests {
        let expected = IndexPart {
            version: 6,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -518,7 +584,8 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
+            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
            lineage: Lineage {
                reparenting_history_truncated: false,
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
@@ -531,60 +598,6 @@ mod tests {
        assert_eq!(part, expected);
    }

-    #[test]
-    fn v7_indexpart_is_parsed() {
-        let example = r#"{
-            "version": 7,
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata": {
-                "disk_consistent_lsn": "0/16960E8",
-                "prev_record_lsn": "0/1696070",
-                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
-                "ancestor_lsn": "0/0",
-                "latest_gc_cutoff_lsn": "0/1696070",
-                "initdb_lsn": "0/1696070",
-                "pg_version": 14
-            },
-            "deleted_at": "2023-07-31T09:00:00.123"
-        }"#;
-
-        let expected = IndexPart {
-            version: 7,
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
-                    file_size: 25600000,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
-                    file_size: 9007199254741001,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::new(
-                Lsn::from_str("0/16960E8").unwrap(),
-                Some(Lsn::from_str("0/1696070").unwrap()),
-                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
-                Lsn::INVALID,
-                Lsn::from_str("0/1696070").unwrap(),
-                Lsn::from_str("0/1696070").unwrap(),
-                14,
-            ).with_recalculated_checksum().unwrap(),
-            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            lineage: Default::default(),
-            last_aux_file_policy: Default::default(),
-        };
-
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
-        assert_eq!(part, expected);
-    }
-
    fn parse_naive_datetime(s: &str) -> NaiveDateTime {
        chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
    }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -1,7 +1,6 @@
 //! Helper functions to upload files to remote storage with a RemoteStorage

 use anyhow::{bail, Context};
-use bytes::Bytes;
 use camino::Utf8Path;
 use fail::fail_point;
 use pageserver_api::shard::TenantShardId;
@@ -10,12 +9,12 @@ use std::time::SystemTime;
 use tokio::fs::{self, File};
 use tokio::io::AsyncSeekExt;
 use tokio_util::sync::CancellationToken;
-use utils::{backoff, pausable_failpoint};
+use utils::backoff;

-use super::index::IndexPart;
 use super::Generation;
 use crate::tenant::remote_timeline_client::{
-    remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
+    index::IndexPart, remote_index_path, remote_initdb_archive_path,
+    remote_initdb_preserved_archive_path,
 };
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
 use utils::id::{TenantId, TimelineId};
@@ -28,7 +27,7 @@ pub(crate) async fn upload_index_part<'a>(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    generation: Generation,
-    index_part: &IndexPart,
+    index_part: &'a IndexPart,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    tracing::trace!("uploading new index part");
@@ -38,16 +37,16 @@ pub(crate) async fn upload_index_part<'a>(
    });
    pausable_failpoint!("before-upload-index-pausable");

-    // FIXME: this error comes too late
-    let serialized = index_part.to_s3_bytes()?;
-    let serialized = Bytes::from(serialized);
-
-    let index_part_size = serialized.len();
+    let index_part_bytes = index_part
+        .to_s3_bytes()
+        .context("serialize index part file into bytes")?;
+    let index_part_size = index_part_bytes.len();
+    let index_part_bytes = bytes::Bytes::from(index_part_bytes);

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
    storage
        .upload_storage_object(
-            futures::stream::once(futures::future::ready(Ok(serialized))),
+            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
            index_part_size,
            &remote_path,
            cancel,
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -187,7 +187,6 @@ impl SecondaryTenant {
        };

        let now = SystemTime::now();
-        tracing::info!("Evicting secondary layer");

        let this = self.clone();

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -45,10 +45,10 @@ use crate::tenant::{

 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
-use futures::Future;
+use futures::{Future, StreamExt};
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
+use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};

 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
@@ -67,6 +67,12 @@ use super::{
 /// download, if the uploader populated it.
 const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);

+/// Range of concurrency we may use when downloading layers within a timeline.  This is independent
+/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
+/// `PageServerConf::secondary_download_concurrency`
+const MAX_LAYER_CONCURRENCY: usize = 16;
+const MIN_LAYER_CONCURRENCY: usize = 1;
+
 pub(super) async fn downloader_task(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
@@ -75,14 +81,15 @@ pub(super) async fn downloader_task(
    cancel: CancellationToken,
    root_ctx: RequestContext,
 ) {
-    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
+    // How many tenants' secondary download operations we will run concurrently
+    let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;

    let generator = SecondaryDownloader {
        tenant_manager,
        remote_storage,
        root_ctx,
    };
-    let mut scheduler = Scheduler::new(generator, concurrency);
+    let mut scheduler = Scheduler::new(generator, tenant_concurrency);

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
@@ -407,7 +414,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                    tracing::warn!("Insufficient space while downloading.  Will retry later.");
                }
                Err(UpdateError::Cancelled) => {
-                    tracing::info!("Shut down while downloading");
+                    tracing::debug!("Shut down while downloading");
                },
                Err(UpdateError::Deserialize(e)) => {
                    tracing::error!("Corrupt content while downloading tenant: {e}");
@@ -513,7 +520,7 @@ impl<'a> TenantDownloader<'a> {
        // cover our access to local storage.
        let Ok(_guard) = self.secondary_state.gate.enter() else {
            // Shutting down
-            return Err(UpdateError::Cancelled);
+            return Ok(());
        };

        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
@@ -709,7 +716,7 @@ impl<'a> TenantDownloader<'a> {
                let mut layer_byte_count: u64 = timeline_state
                    .on_disk_layers
                    .values()
-                    .map(|l| l.metadata.file_size)
+                    .map(|l| l.metadata.file_size())
                    .sum();

                // Remove on-disk layers that are no longer present in heatmap
@@ -720,7 +727,7 @@ impl<'a> TenantDownloader<'a> {
                        .get(layer_file_name)
                        .unwrap()
                        .metadata
-                        .file_size;
+                        .file_size();

                    let local_path = local_layer_path(
                        self.conf,
@@ -841,12 +848,14 @@ impl<'a> TenantDownloader<'a> {

        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());

+        let mut download_futs = Vec::new();
+
        // Download heatmap layers that are not present on local disk, or update their
        // access time if they are already present.
        for layer in timeline.layers {
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!("Cancelled -- dropping out of layer loop");
-                return Err(UpdateError::Cancelled);
+                return Ok(());
            }

            // Existing on-disk layers: just update their access time.
@@ -877,7 +886,9 @@ impl<'a> TenantDownloader<'a> {
                    }
                }

-                if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
+                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
+                    || on_disk.access_time != layer.access_time
+                {
                    // We already have this layer on disk.  Update its access time.
                    tracing::debug!(
                        "Access time updated for layer {}: {} -> {}",
@@ -909,19 +920,35 @@ impl<'a> TenantDownloader<'a> {
                        strftime(&layer.access_time),
                        strftime(evicted_at)
                    );
-                    self.skip_layer(layer);
                    continue;
                }
            }

-            match self
-                .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
-                .await?
-            {
-                Some(layer) => touched.push(layer),
-                None => {
-                    // Not an error but we didn't download it: remote layer is missing.  Don't add it to the list of
-                    // things to consider touched.
+            download_futs.push(self.download_layer(
+                tenant_shard_id,
+                &timeline.timeline_id,
+                layer,
+                ctx,
+            ));
+        }
+
+        // Break up layer downloads into chunks, so that for each chunk we can re-check how much
+        // concurrency to use based on activity level of remote storage.
+        while !download_futs.is_empty() {
+            let chunk =
+                download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
+
+            let concurrency = Self::layer_concurrency(self.remote_storage.activity());
+
+            let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
+            let mut result_stream = std::pin::pin!(result_stream);
+            while let Some(result) = result_stream.next().await {
+                match result {
+                    Err(e) => return Err(e),
+                    Ok(None) => {
+                        // No error, but we didn't download the layer.  Don't mark it touched
+                    }
+                    Ok(Some(layer)) => touched.push(layer),
                }
            }
        }
@@ -952,7 +979,7 @@ impl<'a> TenantDownloader<'a> {
                            tenant_shard_id,
                            &timeline.timeline_id,
                            t.name,
-                            t.metadata.clone(),
+                            LayerFileMetadata::from(&t.metadata),
                            t.access_time,
                            local_path,
                        ));
@@ -964,15 +991,6 @@ impl<'a> TenantDownloader<'a> {
        Ok(())
    }

-    /// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics
-    fn skip_layer(&self, layer: HeatMapLayer) {
-        let mut progress = self.secondary_state.progress.lock().unwrap();
-        progress.layers_total = progress.layers_total.saturating_sub(1);
-        progress.bytes_total = progress
-            .bytes_total
-            .saturating_sub(layer.metadata.file_size);
-    }
-
    async fn download_layer(
        &self,
        tenant_shard_id: &TenantShardId,
@@ -1000,20 +1018,19 @@ impl<'a> TenantDownloader<'a> {
            layer.name,
            layer.metadata.file_size
        );
-        let downloaded_bytes = download_layer_file(
+        let downloaded_bytes = match download_layer_file(
            self.conf,
            self.remote_storage,
            *tenant_shard_id,
            *timeline_id,
            &layer.name,
-            &layer.metadata,
+            &LayerFileMetadata::from(&layer.metadata),
            &local_path,
            &self.secondary_state.cancel,
            ctx,
        )
-        .await;
-
-        let downloaded_bytes = match downloaded_bytes {
+        .await
+        {
            Ok(bytes) => bytes,
            Err(DownloadError::NotFound) => {
                // A heatmap might be out of date and refer to a layer that doesn't exist any more.
@@ -1023,7 +1040,13 @@ impl<'a> TenantDownloader<'a> {
                    "Skipped downloading missing layer {}, raced with compaction/gc?",
                    layer.name
                );
-                self.skip_layer(layer);
+
+                // If the layer is 404, adjust the progress statistics to reflect that we will not download it.
+                let mut progress = self.secondary_state.progress.lock().unwrap();
+                progress.layers_total = progress.layers_total.saturating_sub(1);
+                progress.bytes_total = progress
+                    .bytes_total
+                    .saturating_sub(layer.metadata.file_size);

                return Ok(None);
            }
@@ -1060,6 +1083,19 @@ impl<'a> TenantDownloader<'a> {

        Ok(Some(layer))
    }
+
+    /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage.
+    /// Never returns less than MIN_LAYER_CONCURRENCY: the result is passed to `buffered()`, where zero would make no progress.
+    fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
+        // When less than 75% of units are available, use minimum concurrency.  Else, do a linear mapping
+        // of our concurrency range to the units available within the remaining 25% (integer division may round to 0).
+        let clamp_at = (activity.read_total * 3) / 4;
+        if activity.read_available <= clamp_at {
+            return MIN_LAYER_CONCURRENCY;
+        }
+        let scaled = MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at);
+        (scaled / (activity.read_total - clamp_at)).max(MIN_LAYER_CONCURRENCY)
+    }
 }

 /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1149,7 +1185,7 @@ async fn init_timeline_state(
                                    tenant_shard_id,
                                    &heatmap.timeline_id,
                                    name,
-                                    remote_meta.metadata.clone(),
+                                    LayerFileMetadata::from(&remote_meta.metadata),
                                    remote_meta.access_time,
                                    file_path,
                                ),
@@ -1183,3 +1219,58 @@ async fn init_timeline_state(

    detail
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn layer_concurrency() {
+        // Totally idle: every read unit is available (16 of 16), so expect the maximum concurrency
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 16,
+                read_total: 16,
+                write_available: 16,
+                write_total: 16
+            }),
+            MAX_LAYER_CONCURRENCY
+        );
+
+        // Totally busy: no read units are available (0 of 16), so expect the minimum concurrency
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 0,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MIN_LAYER_CONCURRENCY
+        );
+
+        // Edge of the range at which we interpolate: exactly 75% available (12 of 16) still yields the minimum
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 12,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MIN_LAYER_CONCURRENCY
+        );
+
+        // Midpoint of the range in which we interpolate: 14 of 16 is halfway between 75% and 100%, expect half the maximum
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 14,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MAX_LAYER_CONCURRENCY / 2
+        );
+    }
+}
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,6 +1,6 @@
 use std::time::SystemTime;

-use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
+use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};

 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline {
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
    pub(super) name: LayerName,
-    pub(super) metadata: LayerFileMetadata,
+    pub(super) metadata: IndexLayerMetadata,

    #[serde_as(as = "TimestampSeconds<i64>")]
    pub(super) access_time: SystemTime,
@@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer {
 impl HeatMapLayer {
    pub(crate) fn new(
        name: LayerName,
-        metadata: LayerFileMetadata,
+        metadata: IndexLayerMetadata,
        access_time: SystemTime,
    ) -> Self {
        Self {
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -334,11 +334,8 @@ where

        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            tracing::info!(
-                tenant_id=%tenant_shard_id.tenant_id,
-                shard_id=%tenant_shard_id.shard_slug(),
-                "Command already running, waiting for it"
-            );
+            tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                           "Command already running, waiting for it");
            barrier
        } else {
            let running = self.spawn_now(job);
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -3,6 +3,7 @@ use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;

+use anyhow::{bail, Context};
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -10,7 +11,7 @@ use tokio_util::sync::CancellationToken;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;

-use super::{GcError, LogicalSizeCalculationCause, Tenant};
+use super::{LogicalSizeCalculationCause, Tenant};
 use crate::tenant::Timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -42,44 +43,6 @@ pub struct SegmentMeta {
    pub kind: LsnKind,
 }

-#[derive(thiserror::Error, Debug)]
-pub(crate) enum CalculateSyntheticSizeError {
-    /// Something went wrong internally to the calculation of logical size at a particular branch point
-    #[error("Failed to calculated logical size on timeline {timeline_id} at {lsn}: {error}")]
-    LogicalSize {
-        timeline_id: TimelineId,
-        lsn: Lsn,
-        error: CalculateLogicalSizeError,
-    },
-
-    /// Something went wrong internally when calculating GC parameters at start of size calculation
-    #[error(transparent)]
-    GcInfo(GcError),
-
-    /// Totally unexpected errors, like panics joining a task
-    #[error(transparent)]
-    Fatal(anyhow::Error),
-
-    /// The LSN we are trying to calculate a size at no longer exists at the point we query it
-    #[error("Could not find size at {lsn} in timeline {timeline_id}")]
-    LsnNotFound { timeline_id: TimelineId, lsn: Lsn },
-
-    /// Tenant shut down while calculating size
-    #[error("Cancelled")]
-    Cancelled,
-}
-
-impl From<GcError> for CalculateSyntheticSizeError {
-    fn from(value: GcError) -> Self {
-        match value {
-            GcError::TenantCancelled | GcError::TimelineCancelled => {
-                CalculateSyntheticSizeError::Cancelled
-            }
-            other => CalculateSyntheticSizeError::GcInfo(other),
-        }
-    }
-}
-
 impl SegmentMeta {
    fn size_needed(&self) -> bool {
        match self.kind {
@@ -153,9 +116,12 @@ pub(super) async fn gather_inputs(
    cause: LogicalSizeCalculationCause,
    cancel: &CancellationToken,
    ctx: &RequestContext,
-) -> Result<ModelInputs, CalculateSyntheticSizeError> {
+) -> anyhow::Result<ModelInputs> {
    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
-    tenant.refresh_gc_info(cancel, ctx).await?;
+    tenant
+        .refresh_gc_info(cancel, ctx)
+        .await
+        .context("Failed to refresh gc_info before gathering inputs")?;

    // Collect information about all the timelines
    let mut timelines = tenant.list_timelines();
@@ -361,12 +327,6 @@ pub(super) async fn gather_inputs(
    )
    .await?;

-    if tenant.cancel.is_cancelled() {
-        // If we're shutting down, return an error rather than a sparse result that might include some
-        // timelines from before we started shutting down
-        return Err(CalculateSyntheticSizeError::Cancelled);
-    }
-
    Ok(ModelInputs {
        segments,
        timeline_inputs,
@@ -385,7 +345,7 @@ async fn fill_logical_sizes(
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
-) -> Result<(), CalculateSyntheticSizeError> {
+) -> anyhow::Result<()> {
    let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
        timelines
            .iter()
@@ -427,7 +387,7 @@ async fn fill_logical_sizes(
    }

    // Perform the size lookups
-    let mut have_any_error = None;
+    let mut have_any_error = false;
    while let Some(res) = joinset.join_next().await {
        // each of these come with Result<anyhow::Result<_>, JoinError>
        // because of spawn + spawn_blocking
@@ -438,36 +398,21 @@ async fn fill_logical_sizes(
            Err(join_error) => {
                // cannot really do anything, as this panic is likely a bug
                error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
-
-                have_any_error = Some(CalculateSyntheticSizeError::Fatal(
-                    anyhow::anyhow!(join_error)
-                        .context("task that calls spawn_ondemand_logical_size_calculation"),
-                ));
+                have_any_error = true;
            }
            Ok(Err(recv_result_error)) => {
                // cannot really do anything, as this panic is likely a bug
                error!("failed to receive logical size query result: {recv_result_error:#}");
-                have_any_error = Some(CalculateSyntheticSizeError::Fatal(
-                    anyhow::anyhow!(recv_result_error)
-                        .context("Receiving logical size query result"),
-                ));
+                have_any_error = true;
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
-                if matches!(error, CalculateLogicalSizeError::Cancelled) {
-                    // Skip this: it's okay if one timeline among many is shutting down while we
-                    // calculate inputs for the overall tenant.
-                    continue;
-                } else {
+                if !matches!(error, CalculateLogicalSizeError::Cancelled) {
                    warn!(
                        timeline_id=%timeline.timeline_id,
                        "failed to calculate logical size at {lsn}: {error:#}"
                    );
-                    have_any_error = Some(CalculateSyntheticSizeError::LogicalSize {
-                        timeline_id: timeline.timeline_id,
-                        lsn,
-                        error,
-                    });
                }
+                have_any_error = true;
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
                debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
@@ -481,10 +426,10 @@ async fn fill_logical_sizes(
    // prune any keys not needed anymore; we record every used key and added key.
    logical_size_cache.retain(|key, _| sizes_needed.contains_key(key));

-    if let Some(error) = have_any_error {
+    if have_any_error {
        // we cannot complete this round, because we are missing data.
        // we have however cached all we were able to request calculation on.
-        return Err(error);
+        anyhow::bail!("failed to calculate some logical_sizes");
    }

    // Insert the looked up sizes to the Segments
@@ -499,29 +444,32 @@ async fn fill_logical_sizes(
        if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) {
            seg.segment.size = Some(*size);
        } else {
-            return Err(CalculateSyntheticSizeError::LsnNotFound { timeline_id, lsn });
+            bail!("could not find size at {} in timeline {}", lsn, timeline_id);
        }
    }
    Ok(())
 }

 impl ModelInputs {
-    pub fn calculate_model(&self) -> tenant_size_model::StorageModel {
+    pub fn calculate_model(&self) -> anyhow::Result<tenant_size_model::StorageModel> {
        // Convert SegmentMetas into plain Segments
-        StorageModel {
+        let storage = StorageModel {
            segments: self
                .segments
                .iter()
                .map(|seg| seg.segment.clone())
                .collect(),
-        }
+        };
+
+        Ok(storage)
    }

    // calculate total project size
-    pub fn calculate(&self) -> u64 {
-        let storage = self.calculate_model();
+    pub fn calculate(&self) -> anyhow::Result<u64> {
+        let storage = self.calculate_model()?;
        let sizes = storage.calculate();
-        sizes.total_size
+
+        Ok(sizes.total_size)
    }
 }

@@ -708,7 +656,7 @@ fn verify_size_for_multiple_branches() {
 "#;
    let inputs: ModelInputs = serde_json::from_str(doc).unwrap();

-    assert_eq!(inputs.calculate(), 37_851_408);
+    assert_eq!(inputs.calculate().unwrap(), 37_851_408);
 }

 #[test]
@@ -763,7 +711,7 @@ fn verify_size_for_one_branch() {

    let model: ModelInputs = serde_json::from_str(doc).unwrap();

-    let res = model.calculate_model().calculate();
+    let res = model.calculate_model().unwrap().calculate();

    println!("calculated synthetic size: {}", res.total_size);
    println!("result: {:?}", serde_json::to_string(&res.segments));
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
 #[derive(Debug)]
 struct LayerKeyspace {
    layer: ReadableLayer,
-    target_keyspace: Vec<KeySpace>,
+    target_keyspace: KeySpace,
 }

 impl LayerFringe {
@@ -336,7 +336,6 @@ impl LayerFringe {
        };

        let removed = self.layers.remove_entry(&read_desc.layer_id);
-
        match removed {
            Some((
                _,
@@ -344,15 +343,7 @@ impl LayerFringe {
                    layer,
                    target_keyspace,
                },
-            )) => {
-                let mut keyspace = KeySpaceRandomAccum::new();
-                for ks in target_keyspace {
-                    for part in ks.ranges {
-                        keyspace.add_range(part);
-                    }
-                }
-                Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
-            }
+            )) => Some((layer, target_keyspace, read_desc.lsn_range)),
            None => unreachable!("fringe internals are always consistent"),
        }
    }
@@ -367,7 +358,7 @@ impl LayerFringe {
        let entry = self.layers.entry(layer_id.clone());
        match entry {
            Entry::Occupied(mut entry) => {
-                entry.get_mut().target_keyspace.push(keyspace);
+                entry.get_mut().target_keyspace.merge(&keyspace);
            }
            Entry::Vacant(entry) => {
                self.planned_reads_by_lsn.push(ReadDesc {
@@ -376,7 +367,7 @@ impl LayerFringe {
                });
                entry.insert(LayerKeyspace {
                    layer,
-                    target_keyspace: vec![keyspace],
+                    target_keyspace: keyspace,
                });
            }
        }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -219,6 +219,7 @@ pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,
+    lsn_range: Range<Lsn>,

    file: VirtualFile,
    file_id: FileId,
@@ -477,23 +478,6 @@ impl DeltaLayerWriterInner {
        key_end: Key,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<ResidentLayer> {
-        let temp_path = self.path.clone();
-        let result = self.finish0(key_end, timeline, ctx).await;
-        if result.is_err() {
-            tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
-            if let Err(e) = std::fs::remove_file(&temp_path) {
-                tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
-            }
-        }
-        result
-    }
-
-    async fn finish0(
-        self,
-        key_end: Key,
-        timeline: &Arc<Timeline>,
-        ctx: &RequestContext,
    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -667,11 +651,19 @@ impl DeltaLayerWriter {
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(key_end, timeline, ctx)
-            .await
+        let inner = self.inner.take().unwrap();
+        let temp_path = inner.path.clone();
+        let result = inner.finish(key_end, timeline, ctx).await;
+        // The delta layer files can sometimes be really large. Clean them up.
+        if result.is_err() {
+            tracing::warn!(
+                "Cleaning up temporary delta file {temp_path} after error during writing"
+            );
+            if let Err(e) = std::fs::remove_file(&temp_path) {
+                tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}")
+            }
+        }
+        result
    }
 }

@@ -784,6 +776,7 @@ impl DeltaLayerInner {
            file_id,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
+            lsn_range: actual_summary.lsn_range,
            max_vectored_read_bytes,
        }))
    }
@@ -909,7 +902,7 @@ impl DeltaLayerInner {

        let reads = Self::plan_reads(
            &keyspace,
-            lsn_range.clone(),
+            lsn_range,
            data_end_offset,
            index_reader,
            planner,
@@ -922,50 +915,11 @@ impl DeltaLayerInner {
        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
            .await;

-        reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start);
+        reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);

        Ok(())
    }

-    /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
-    #[cfg(test)]
-    pub(super) async fn load_key_values(
-        &self,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            block_reader,
-        );
-        let mut result = Vec::new();
-        let mut stream =
-            Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx));
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let cursor = block_reader.block_cursor();
-        let mut buf = Vec::new();
-        while let Some(item) = stream.next().await {
-            let (key, lsn, pos) = item?;
-            // TODO: dedup code with get_reconstruct_value
-            // TODO: ctx handling and sharding
-            cursor
-                .read_blob_into_buf(pos.pos(), &mut buf, ctx)
-                .await
-                .with_context(|| {
-                    format!("Failed to read blob from virtual file {}", self.file.path)
-                })?;
-            let val = Value::des(&buf).with_context(|| {
-                format!(
-                    "Failed to deserialize file blob from virtual file {}",
-                    self.file.path
-                )
-            })?;
-            result.push((key, lsn, val));
-        }
-        Ok(result)
-    }
-
    async fn plan_reads<Reader>(
        keyspace: &KeySpace,
        lsn_range: Range<Lsn>,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -47,7 +47,7 @@ use hex;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
-use pageserver_api::shard::{ShardIdentity, TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::File;
@@ -473,7 +473,7 @@ impl ImageLayerInner {
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        let reads = self
-            .plan_reads(keyspace, None, ctx)
+            .plan_reads(keyspace, ctx)
            .await
            .map_err(GetVectoredError::Other)?;

@@ -485,43 +485,9 @@ impl ImageLayerInner {
        Ok(())
    }

-    /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
-    #[cfg(test)]
-    pub(super) async fn load_key_values(
-        &self,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let tree_reader =
-            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
-        let mut result = Vec::new();
-        let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let cursor = block_reader.block_cursor();
-        while let Some(item) = stream.next().await {
-            // TODO: dedup code with get_reconstruct_value
-            let (raw_key, offset) = item?;
-            let key = Key::from_slice(&raw_key[..KEY_SIZE]);
-            // TODO: ctx handling and sharding
-            let blob = cursor
-                .read_blob(offset, ctx)
-                .await
-                .with_context(|| format!("failed to read value from offset {}", offset))?;
-            let value = Bytes::from(blob);
-            result.push((key, self.lsn, Value::Image(value)));
-        }
-        Ok(result)
-    }
-
-    /// Traverse the layer's index to build read operations on the overlap of the input keyspace
-    /// and the keys in this layer.
-    ///
-    /// If shard_identity is provided, it will be used to filter keys down to those stored on
-    /// this shard.
    async fn plan_reads(
        &self,
        keyspace: KeySpace,
-        shard_identity: Option<&ShardIdentity>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<VectoredRead>> {
        let mut planner = VectoredReadPlanner::new(
@@ -541,6 +507,7 @@ impl ImageLayerInner {

        for range in keyspace.ranges.iter() {
            let mut range_end_handled = false;
+
            let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
            range.start.write_to_byte_slice(&mut search_key);

@@ -553,22 +520,12 @@ impl ImageLayerInner {
                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
                assert!(key >= range.start);

-                let flag = if let Some(shard_identity) = shard_identity {
-                    if shard_identity.is_key_disposable(&key) {
-                        BlobFlag::Ignore
-                    } else {
-                        BlobFlag::None
-                    }
-                } else {
-                    BlobFlag::None
-                };
-
                if key >= range.end {
                    planner.handle_range_end(offset);
                    range_end_handled = true;
                    break;
                } else {
-                    planner.handle(key, self.lsn, offset, flag);
+                    planner.handle(key, self.lsn, offset, BlobFlag::None);
                }
            }

@@ -581,50 +538,6 @@ impl ImageLayerInner {
        Ok(planner.finish())
    }

-    /// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
-    /// then execute vectored GET operations, passing the results of all read keys into the writer.
-    pub(super) async fn filter(
-        &self,
-        shard_identity: &ShardIdentity,
-        writer: &mut ImageLayerWriter,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<usize> {
-        // Fragment the range into the regions owned by this ShardIdentity
-        let plan = self
-            .plan_reads(
-                KeySpace {
-                    // If asked for the total key space, plan_reads will give us all the keys in the layer
-                    ranges: vec![Key::MIN..Key::MAX],
-                },
-                Some(shard_identity),
-                ctx,
-            )
-            .await?;
-
-        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
-        let mut key_count = 0;
-        for read in plan.into_iter() {
-            let buf_size = read.size();
-
-            let buf = BytesMut::with_capacity(buf_size);
-            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
-
-            let frozen_buf = blobs_buf.buf.freeze();
-
-            for meta in blobs_buf.blobs.iter() {
-                let img_buf = frozen_buf.slice(meta.start..meta.end);
-
-                key_count += 1;
-                writer
-                    .put_image(meta.meta.key, img_buf, ctx)
-                    .await
-                    .context(format!("Storing key {}", meta.meta.key))?;
-            }
-        }
-
-        Ok(key_count)
-    }
-
    async fn do_reads_and_update_state(
        &self,
        reads: Vec<VectoredRead>,
@@ -942,196 +855,3 @@ impl Drop for ImageLayerWriter {
        }
    }
 }
-
-#[cfg(test)]
-mod test {
-    use std::time::Duration;
-
-    use bytes::Bytes;
-    use pageserver_api::{
-        key::Key,
-        shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
-    };
-    use utils::{
-        generation::Generation,
-        id::{TenantId, TimelineId},
-        lsn::Lsn,
-    };
-
-    use crate::{
-        tenant::{config::TenantConf, harness::TenantHarness},
-        DEFAULT_PG_VERSION,
-    };
-
-    use super::ImageLayerWriter;
-
-    #[tokio::test]
-    async fn image_layer_rewrite() {
-        let tenant_conf = TenantConf {
-            gc_period: Duration::ZERO,
-            compaction_period: Duration::ZERO,
-            ..TenantConf::default()
-        };
-        let tenant_id = TenantId::generate();
-        let mut gen = Generation::new(0xdead0001);
-        let mut get_next_gen = || {
-            let ret = gen;
-            gen = gen.next();
-            ret
-        };
-        // The LSN at which we will create an image layer to filter
-        let lsn = Lsn(0xdeadbeef0000);
-        let timeline_id = TimelineId::generate();
-
-        //
-        // Create an unsharded parent with a layer.
-        //
-
-        let harness = TenantHarness::create_custom(
-            "test_image_layer_rewrite--parent",
-            tenant_conf.clone(),
-            tenant_id,
-            ShardIdentity::unsharded(),
-            get_next_gen(),
-        )
-        .unwrap();
-        let (tenant, ctx) = harness.load().await;
-        let timeline = tenant
-            .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
-        let range = input_start..input_end;
-
-        // Build an image layer to filter
-        let resident = {
-            let mut writer = ImageLayerWriter::new(
-                harness.conf,
-                timeline_id,
-                harness.tenant_shard_id,
-                &range,
-                lsn,
-                &ctx,
-            )
-            .await
-            .unwrap();
-
-            let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
-            let mut key = range.start;
-            while key < range.end {
-                writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
-
-                key = key.next();
-            }
-            writer.finish(&timeline, &ctx).await.unwrap()
-        };
-        let original_size = resident.metadata().file_size;
-
-        //
-        // Create child shards and do the rewrite, exercising filter().
-        // TODO: abstraction in TenantHarness for splits.
-        //
-
-        // Filter for various shards: this exercises cases like values at start of key range, end of key
-        // range, middle of key range.
-        let shard_count = ShardCount::new(4);
-        for shard_number in 0..shard_count.count() {
-            //
-            // mimic the shard split
-            //
-            let shard_identity = ShardIdentity::new(
-                ShardNumber(shard_number),
-                shard_count,
-                ShardStripeSize(0x8000),
-            )
-            .unwrap();
-            let harness = TenantHarness::create_custom(
-                Box::leak(Box::new(format!(
-                    "test_image_layer_rewrite--child{}",
-                    shard_identity.shard_slug()
-                ))),
-                tenant_conf.clone(),
-                tenant_id,
-                shard_identity,
-                // NB: in reality, the shards would each fork off their own gen number sequence from the parent.
-                // But here, all we care about is that the gen number is unique.
-                get_next_gen(),
-            )
-            .unwrap();
-            let (tenant, ctx) = harness.load().await;
-            let timeline = tenant
-                .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
-                .await
-                .unwrap();
-
-            //
-            // use filter() and make assertions
-            //
-
-            let mut filtered_writer = ImageLayerWriter::new(
-                harness.conf,
-                timeline_id,
-                harness.tenant_shard_id,
-                &range,
-                lsn,
-                &ctx,
-            )
-            .await
-            .unwrap();
-
-            let wrote_keys = resident
-                .filter(&shard_identity, &mut filtered_writer, &ctx)
-                .await
-                .unwrap();
-            let replacement = if wrote_keys > 0 {
-                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
-            } else {
-                None
-            };
-
-            // This exact size and those below will need updating as/when the layer encoding changes, but
-            // should be deterministic for a given version of the format, as we used no randomness generating the input.
-            assert_eq!(original_size, 1597440);
-
-            match shard_number {
-                0 => {
-                    // We should have written out just one stripe for our shard identity
-                    assert_eq!(wrote_keys, 0x8000);
-                    let replacement = replacement.unwrap();
-
-                    // We should have dropped some of the data
-                    assert!(replacement.metadata().file_size < original_size);
-                    assert!(replacement.metadata().file_size > 0);
-
-                    // Assert that we dropped ~3/4 of the data.
-                    assert_eq!(replacement.metadata().file_size, 417792);
-                }
-                1 => {
-                    // Shard 1 has no keys in our input range
-                    assert_eq!(wrote_keys, 0x0);
-                    assert!(replacement.is_none());
-                }
-                2 => {
-                    // Shard 2 has one stripes in the input range
-                    assert_eq!(wrote_keys, 0x8000);
-                    let replacement = replacement.unwrap();
-                    assert!(replacement.metadata().file_size < original_size);
-                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 417792);
-                }
-                3 => {
-                    // Shard 3 has two stripes in the input range
-                    assert_eq!(wrote_keys, 0x10000);
-                    let replacement = replacement.unwrap();
-                    assert!(replacement.metadata().file_size < original_size);
-                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 811008);
-                }
-                _ => unreachable!(),
-            }
-        }
-    }
-}
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -52,7 +52,7 @@ pub struct InMemoryLayer {

    /// Frozen layers have an exclusive end LSN.
    /// Writes are only allowed when this is `None`.
-    pub(crate) end_lsn: OnceLock<Lsn>,
+    end_lsn: OnceLock<Lsn>,

    /// Used for traversal path. Cached representation of the in-memory layer before frozen.
    local_path_str: Arc<str>,
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::{
    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
-use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
+use pageserver_api::shard::{ShardIndex, TenantShardId};
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
@@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
-use super::image_layer::{self};
+use super::image_layer;
 use super::{
-    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
+    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
+    ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -161,7 +161,7 @@ impl Layer {
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size,
+            metadata.file_size(),
        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
@@ -194,7 +194,7 @@ impl Layer {
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size,
+            metadata.file_size(),
        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
@@ -227,7 +227,7 @@ impl Layer {

        timeline
            .metrics
-            .resident_physical_size_add(metadata.file_size);
+            .resident_physical_size_add(metadata.file_size());

        ResidentLayer { downloaded, owner }
    }
@@ -277,10 +277,9 @@ impl Layer {

        let downloaded = resident.expect("just initialized");

-        // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
-        // TODO: this leaves the temp file in place if the rename fails, risking us running
-        // out of space. Should we clean it up here or does the calling context deal with this?
-        utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
+        // if the rename works, the path is as expected
+        // TODO: sync system call
+        std::fs::rename(temp_path, owner.local_path())
            .with_context(|| format!("rename temporary file as correct path for {owner}"))?;

        Ok(ResidentLayer { downloaded, owner })
@@ -367,10 +366,7 @@ impl Layer {
            .0
            .get_or_maybe_download(true, Some(ctx))
            .await
-            .map_err(|err| match err {
-                DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
-                other => GetVectoredError::Other(anyhow::anyhow!(other)),
-            })?;
+            .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;

        self.0
            .access_stats
@@ -388,23 +384,6 @@ impl Layer {
            })
    }

-    /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
-    #[cfg(test)]
-    pub(crate) async fn load_key_values(
-        &self,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
-        let layer = self
-            .0
-            .get_or_maybe_download(true, Some(ctx))
-            .await
-            .map_err(|err| match err {
-                DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
-                other => GetVectoredError::Other(anyhow::anyhow!(other)),
-            })?;
-        layer.load_key_values(&self.0, ctx).await
-    }
-
    /// Download the layer if evicted.
    ///
    /// Will not error when the layer is already downloaded.
@@ -1179,11 +1158,6 @@ impl LayerInner {
                let consecutive_failures =
                    1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);

-                if timeline.cancel.is_cancelled() {
-                    // If we're shutting down, drop out before logging the error
-                    return Err(e);
-                }
-
                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");

                let backoff = utils::backoff::exponential_backoff_duration_seconds(
@@ -1774,20 +1748,6 @@ impl DownloadedLayer {
        }
    }

-    #[cfg(test)]
-    async fn load_key_values(
-        &self,
-        owner: &Arc<LayerInner>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
-        use LayerKind::*;
-
-        match self.get(owner, ctx).await? {
-            Delta(d) => d.load_key_values(ctx).await,
-            Image(i) => i.load_key_values(ctx).await,
-        }
-    }
-
    async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
        use LayerKind::*;
        match self.get(owner, ctx).await? {
@@ -1842,15 +1802,16 @@ impl ResidentLayer {
        use LayerKind::*;

        let owner = &self.owner.0;
+
        match self.downloaded.get(owner, ctx).await? {
            Delta(ref d) => {
-                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
-                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
-                // while it's being held.
                owner
                    .access_stats
                    .record_access(LayerAccessKind::KeyIter, ctx);

+                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
+                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
+                // while it's being held.
                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
                    .with_context(|| format!("Layer index is corrupted for {self}"))
@@ -1859,23 +1820,6 @@ impl ResidentLayer {
        }
    }

-    /// Read all they keys in this layer which match the ShardIdentity, and write them all to
-    /// the provided writer.  Return the number of keys written.
-    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
-    pub(crate) async fn filter<'a>(
-        &'a self,
-        shard_identity: &ShardIdentity,
-        writer: &mut ImageLayerWriter,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<usize> {
-        use LayerKind::*;
-
-        match self.downloaded.get(&self.owner.0, ctx).await? {
-            Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
-            Image(i) => i.filter(shard_identity, writer, ctx).await,
-        }
-    }
-
    /// Returns the amount of keys and values written to the writer.
    pub(crate) async fn copy_delta_prefix(
        &self,
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -815,7 +815,6 @@ async fn eviction_cancellation_on_drop() {
 /// A test case to remind you the cost of these structures. You can bump the size limit
 /// below if it is really necessary to add more fields to the structures.
 #[test]
-#[cfg(target_arch = "x86_64")]
 fn layer_size() {
    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -17,7 +17,7 @@ use crate::tenant::{Tenant, TenantState};
 use rand::Rng;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{backoff, completion, pausable_failpoint};
+use utils::{backoff, completion};

 static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
    once_cell::sync::Lazy::new(|| {
@@ -380,28 +380,21 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                let res = tenant
                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
                    .await;
-                match res {
-                    Ok(_) => {
-                        error_run_count = 0;
-                        period
-                    }
-                    Err(crate::tenant::GcError::TenantCancelled) => {
-                        return;
-                    }
-                    Err(e) => {
-                        let wait_duration = backoff::exponential_backoff_duration_seconds(
-                            error_run_count + 1,
-                            1.0,
-                            MAX_BACKOFF_SECS,
-                        );
-                        error_run_count += 1;
-                        let wait_duration = Duration::from_secs_f64(wait_duration);
-
-                        error!(
+                if let Err(e) = res {
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count + 1,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    let wait_duration = Duration::from_secs_f64(wait_duration);
+                    error!(
                        "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
                    );
-                        wait_duration
-                    }
+                    wait_duration
+                } else {
+                    error_run_count = 0;
+                    period
                }
            };

--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;`