neon_local: pin pageservers and endpoints

pageserver: skip the block range check: the check is slow (it reads the relsize page)
when queries aren't hitting latest=true
2026-01-25 14:20:38 +00:00 · 2024-01-03 16:14:58 +00:00 · 2023-12-20 20:06:56 +00:00 · 2023-12-20 20:06:46 +00:00 · 2023-12-20 20:05:06 +00:00 · 2023-12-20 20:03:23 +00:00
194 changed files with 3536 additions and 9666 deletions
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@@ -1,2 +0,0 @@
-[profile.default]
-slow-timeout = "1m"
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -11,7 +11,7 @@ on:
    #          │ │ ┌───────────── day of the month (1 - 31)
    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron:   '0 3 * * *' # run once a day, timezone is utc
+    - cron:  '0 3 * * *' # run once a day, timezone is utc

  workflow_dispatch: # adds ability to run this manually
    inputs:
@@ -23,21 +23,6 @@ on:
        type: boolean
        description: 'Publish perf report. If not set, the report will be published only for the main branch'
        required: false
-      collect_olap_explain:
-        type: boolean
-        description: 'Collect EXPLAIN ANALYZE for OLAP queries. If not set, EXPLAIN ANALYZE will not be collected'
-        required: false
-        default: false
-      collect_pg_stat_statements:
-        type: boolean
-        description: 'Collect pg_stat_statements for OLAP queries. If not set, pg_stat_statements will not be collected'
-        required: false
-        default: false
-      run_AWS_RDS_AND_AURORA:
-        type: boolean
-        description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
-        required: false
-        default: false

 defaults:
  run:
@@ -128,8 +113,6 @@ jobs:
    # - neon-captest-reuse: Reusing existing project
    # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
-    env:
-      RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
    runs-on: ubuntu-latest
    outputs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
@@ -169,7 +152,7 @@ jobs:
          ]
        }'

-        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
+        if [ "$(date +%A)" = "Saturday" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
                                                   { "platform": "rds-aurora"   }]')
        fi
@@ -188,9 +171,9 @@ jobs:
          ]
        }'

-        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
+        if [ "$(date +%A)" = "Saturday" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                    { "platform": "rds-aurora",   "scale": "10" }]')
+                                                   { "platform": "rds-aurora",   "scale": "10" }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -354,8 +337,6 @@ jobs:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
-      TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
-      TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.platform }}
@@ -418,8 +399,6 @@ jobs:
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-        TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }}
-        TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }}
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        TEST_OLAP_SCALE: 10

--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -1,105 +0,0 @@
-name: Build and Push Docker Image
-
-on:
-  workflow_call:
-    inputs:
-      dockerfile-path:
-        required: true
-        type: string
-      image-name:
-        required: true
-        type: string
-    outputs:
-      build-tools-tag:
-        description: "tag generated for build tools"
-        value: ${{ jobs.tag.outputs.build-tools-tag }}
-
-jobs:
-  check-if-build-tools-dockerfile-changed:
-    runs-on: ubuntu-latest
-    outputs:
-      docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
-    steps:
-      - name: Check if Dockerfile.buildtools has changed
-        id: dockerfile
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
-            echo "docker_file_changed=false" >> $GITHUB_OUTPUT
-            exit
-          fi
-          updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
-          if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
-            echo "docker_file_changed=true" >> $GITHUB_OUTPUT
-          fi
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  tag:
-    runs-on: ubuntu-latest
-    needs: [ check-if-build-tools-dockerfile-changed ]
-    outputs:
-      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
-
-    steps:
-      - name: Get buildtools tag
-        env:
-          DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
-            IMAGE_TAG=$GITHUB_RUN_ID
-          else
-            IMAGE_TAG=pinned
-          fi
-
-          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
-        shell: bash
-        id: buildtools-tag
-
-  kaniko:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, x64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
-
-  kaniko-arm:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, arm64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-  manifest:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    name: 'manifest'
-    runs-on: [ self-hosted, dev, x64 ]
-    needs:
-      - tag
-      - kaniko
-      - kaniko-arm
-      - check-if-build-tools-dockerfile-changed
-
-    steps:
-      - name: Create manifest
-        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-      - name: Push manifest
-        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -44,6 +44,7 @@ jobs:

        exit 1

+
  tag:
    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
@@ -73,19 +74,11 @@ jobs:
        shell: bash
        id: build-tag

-  build-buildtools-image:
-    needs: [ check-permissions ]
-    uses: ./.github/workflows/build_and_push_docker_image.yml
-    with:
-      dockerfile-path: Dockerfile.buildtools
-      image-name: build-tools
-    secrets: inherit
-
  check-codestyle-python:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

    steps:
@@ -105,20 +98,20 @@ jobs:
      - name: Install Python deps
        run: ./scripts/pysync

-      - name: Run `ruff check` to ensure code format
-        run: poetry run ruff check .
+      - name: Run ruff to ensure code format
+        run: poetry run ruff .

-      - name: Run `ruff format` to ensure code format
-        run: poetry run ruff format --check .
+      - name: Run black to ensure code format
+        run: poetry run black --diff --check .

      - name: Run mypy to check types
        run: poetry run mypy .

  check-codestyle-rust:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

    steps:
@@ -182,10 +175,10 @@ jobs:
        run: cargo deny check --hide-inclusion-graph

  build-neon:
-    needs: [ check-permissions, tag, build-buildtools-image ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    strategy:
      fail-fast: false
@@ -339,16 +332,16 @@ jobs:
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

-      - name: Run rust tests
+      - name: Run cargo test
        run: |
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
@@ -358,7 +351,7 @@ jobs:
          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure

      - name: Install rust binaries
        run: |
@@ -415,10 +408,10 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-neon, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
    strategy:
@@ -454,10 +447,10 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  benchmarks:
-    needs: [ check-permissions, build-neon, build-buildtools-image ]
+    needs: [ check-permissions, build-neon ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -486,12 +479,12 @@ jobs:
      # while coverage is currently collected for the debug ones

  create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}

    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

    steps:
@@ -533,10 +526,11 @@ jobs:
            })

  coverage-report:
-    needs: [ check-permissions, regress-tests, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests ]
+
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    strategy:
      fail-fast: false
@@ -700,7 +694,7 @@ jobs:
            }"

  neon-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
@@ -739,7 +733,6 @@ jobs:
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-                           --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -750,7 +743,7 @@ jobs:

  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
@@ -785,7 +778,6 @@ jobs:
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-tools
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -796,7 +788,7 @@ jobs:
        run: rm -rf ~/.ecr

  compute-node-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: gcr.io/kaniko-project/executor:v1.9.2-debug
@@ -844,7 +836,6 @@ jobs:
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                           --build-arg PG_VERSION=${{ matrix.version }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-node
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
@@ -866,7 +857,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.21.0
+      VM_BUILDER_VERSION: v0.19.0

    steps:
      - name: Checkout
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -218,7 +218,7 @@ jobs:

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
--- a/.github/workflows/update_build_tools_image.yml
+++ b/.github/workflows/update_build_tools_image.yml
@@ -1,130 +0,0 @@
-name: 'Update build tools image tag'
-
-# This workflow it used to update tag of build tools in ECR.
-# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
-
-on:
-  workflow_dispatch:
-    inputs:
-      from-tag:
-        description: 'Source tag'
-        required: true
-        type: string
-      to-tag:
-        description: 'Destination tag'
-        required: true
-        type: string
-        default: 'pinned'
-
-defaults:
-  run:
-    shell: bash -euo pipefail {0}
-
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-permissions: {}
-
-jobs:
-  tag-image:
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-    outputs:
-      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
-      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Get source image digest
-        id: next-digest
-        run: |
-          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
-            exit 1
-          fi
-
-          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
-          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
-
-      - name: Get destination image digest (if already exists)
-        id: prev-digest
-        run: |
-          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
-          if [ -z "${PREV_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
-          else
-            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
-
-            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Tag image
-        run: |
-          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
-
-  rollback-tag-image:
-    needs:  tag-image
-    if: ${{ !success() }}
-
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Restore previous tag if needed
-        run: |
-          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
-          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
-
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
-            exit 0
-          fi
-
-          if [ -z "${PREV_DIGEST}" ]; then
-            # I guess we should delete the tag here/untag the image, but crane does not support it
-            # - https://github.com/google/go-containerregistry/issues/999
-
-            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
-
-            exit 0
-          fi
-
-          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
-          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
-            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
-
-            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
-          else
-            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
-          fi
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,6 @@ __pycache__/
 test_output/
 .vscode
 .idea
-neon.iml
 /.neon
 /integration_tests/.neon

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -70,17 +70,3 @@ We're using the following approach to make it work:
 - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)

 For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
-
-## How do I add the "pinned" tag to an buildtools image?
-We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
-
-You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
-or using GitHub CLI:
-
-```bash
-gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-            -f from-tag=6254913013 \
-            -f to-tag=pinned \
-
-# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
-```
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -190,9 +190,9 @@ dependencies = [

 [[package]]
 name = "async-compression"
-version = "0.4.5"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5"
+checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11"
 dependencies = [
 "flate2",
 "futures-core",
@@ -1161,7 +1161,6 @@ dependencies = [
 "flate2",
 "futures",
 "hyper",
- "nix 0.26.2",
 "notify",
 "num_cpus",
 "opentelemetry",
@@ -1169,10 +1168,8 @@ dependencies = [
 "regex",
 "remote_storage",
 "reqwest",
- "rust-ini",
 "serde",
 "serde_json",
- "signal-hook",
 "tar",
 "tokio",
 "tokio-postgres",
@@ -1204,26 +1201,6 @@ version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"

-[[package]]
-name = "const-random"
-version = "0.1.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a"
-dependencies = [
- "const-random-macro",
-]
-
-[[package]]
-name = "const-random-macro"
-version = "0.1.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
-dependencies = [
- "getrandom 0.2.11",
- "once_cell",
- "tiny-keccak",
-]
-
 [[package]]
 name = "const_fn"
 version = "0.4.9"
@@ -1456,12 +1433,6 @@ dependencies = [
 "winapi",
 ]

-[[package]]
-name = "crunchy"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
-
 [[package]]
 name = "crypto-bigint"
 version = "0.4.9"
@@ -1604,15 +1575,6 @@ dependencies = [
 "syn 2.0.32",
 ]

-[[package]]
-name = "dlv-list"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f"
-dependencies = [
- "const-random",
-]
-
 [[package]]
 name = "dyn-clone"
 version = "1.0.14"
@@ -2539,14 +2501,13 @@ dependencies = [

 [[package]]
 name = "jsonwebtoken"
-version = "9.2.0"
+version = "8.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
+checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378"
 dependencies = [
 "base64 0.21.1",
- "js-sys",
- "pem 3.0.3",
- "ring 0.17.6",
+ "pem 1.1.1",
+ "ring 0.16.20",
 "serde",
 "serde_json",
 "simple_asn1",
@@ -3081,16 +3042,6 @@ dependencies = [
 "tokio-stream",
 ]

-[[package]]
-name = "ordered-multimap"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
-dependencies = [
- "dlv-list",
- "hashbrown 0.14.0",
-]
-
 [[package]]
 name = "os_info"
 version = "3.7.0"
@@ -3376,19 +3327,18 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"

 [[package]]
 name = "pem"
-version = "2.0.1"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a"
+checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8"
 dependencies = [
- "base64 0.21.1",
- "serde",
+ "base64 0.13.1",
 ]

 [[package]]
 name = "pem"
-version = "3.0.3"
+version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
+checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a"
 dependencies = [
 "base64 0.21.1",
 "serde",
@@ -4264,16 +4214,6 @@ dependencies = [
 "unicode-ident",
 ]

-[[package]]
-name = "rust-ini"
-version = "0.20.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a"
-dependencies = [
- "cfg-if",
- "ordered-multimap",
-]
-
 [[package]]
 name = "rustc-demangle"
 version = "0.1.23"
@@ -4405,14 +4345,12 @@ dependencies = [
 "async-stream",
 "aws-config",
 "aws-sdk-s3",
- "aws-smithy-async",
 "bincode",
 "bytes",
 "chrono",
 "clap",
 "crc32c",
 "either",
- "futures",
 "futures-util",
 "hex",
 "histogram",
@@ -4451,7 +4389,6 @@ dependencies = [
 "clap",
 "const_format",
 "crc32c",
- "fail",
 "fs2",
 "futures",
 "git-version",
@@ -4475,7 +4412,6 @@ dependencies = [
 "serde",
 "serde_json",
 "serde_with",
- "sha2",
 "signal-hook",
 "storage_broker",
 "thiserror",
@@ -4528,12 +4464,12 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"

 [[package]]
 name = "sct"
-version = "0.7.1"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
+checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4"
 dependencies = [
- "ring 0.17.6",
- "untrusted 0.9.0",
+ "ring 0.16.20",
+ "untrusted 0.7.1",
 ]

 [[package]]
@@ -5232,15 +5168,6 @@ dependencies = [
 "time-core",
 ]

-[[package]]
-name = "tiny-keccak"
-version = "2.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
-dependencies = [
- "crunchy",
-]
-
 [[package]]
 name = "tinytemplate"
 version = "1.2.1"
@@ -5884,7 +5811,6 @@ dependencies = [
 "chrono",
 "const_format",
 "criterion",
- "fail",
 "futures",
 "heapless",
 "hex",
@@ -6409,7 +6335,6 @@ dependencies = [
 "futures-io",
 "futures-sink",
 "futures-util",
- "getrandom 0.2.11",
 "hex",
 "hmac",
 "hyper",
@@ -6421,7 +6346,6 @@ dependencies = [
 "num-bigint",
 "num-integer",
 "num-traits",
- "once_cell",
 "prost",
 "rand 0.8.5",
 "regex",
@@ -6524,28 +6448,30 @@ checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"

 [[package]]
 name = "zstd"
-version = "0.13.0"
+version = "0.12.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110"
+checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
 dependencies = [
 "zstd-safe",
 ]

 [[package]]
 name = "zstd-safe"
-version = "7.0.0"
+version = "6.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e"
+checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
 dependencies = [
+ "libc",
 "zstd-sys",
 ]

 [[package]]
 name = "zstd-sys"
-version = "2.0.9+zstd.1.5.5"
+version = "2.0.8+zstd.1.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656"
+checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
 dependencies = [
 "cc",
+ "libc",
 "pkg-config",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -93,7 +93,7 @@ hyper-tungstenite = "0.11"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
-jsonwebtoken = "9"
+jsonwebtoken = "8"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
--- a/2
+++ b/2
@@ -3,7 +3,7 @@
 ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
 ### inside this image in the real deployments.
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned

 # Build Postgres
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -1,166 +0,0 @@
-FROM debian:bullseye-slim
-
-# Add nonroot user
-RUN useradd -ms /bin/bash nonroot -b /home
-SHELL ["/bin/bash", "-c"]
-
-# System deps
-RUN set -e \
-    && apt update \
-    && apt install -y \
-        autoconf \
-        automake \
-        bison \
-        build-essential \
-        ca-certificates \
-        cmake \
-        curl \
-        flex \
-        git \
-        gnupg \
-        gzip \
-        jq \
-        libcurl4-openssl-dev \
-        libbz2-dev \
-        libffi-dev \
-        liblzma-dev \
-        libncurses5-dev \
-        libncursesw5-dev \
-        libpq-dev \
-        libreadline-dev \
-        libseccomp-dev \
-        libsqlite3-dev \
-        libssl-dev \
-        libstdc++-10-dev \
-        libtool \
-        libxml2-dev \
-        libxmlsec1-dev \
-        libxxhash-dev \
-        lsof \
-        make \
-        netcat \
-        net-tools \
-        openssh-client \
-        parallel \
-        pkg-config \
-        unzip \
-        wget \
-        xz-utils \
-        zlib1g-dev \
-        zstd \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# protobuf-compiler (protoc)
-ENV PROTOC_VERSION 25.1
-RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
-    && unzip -q protoc.zip -d protoc \
-    && mv protoc/bin/protoc /usr/local/bin/protoc \
-    && mv protoc/include/google /usr/local/include/google \
-    && rm -rf protoc.zip protoc
-
-# LLVM
-ENV LLVM_VERSION=17
-RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
-    && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
-    && apt update \
-    && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
-    && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# PostgreSQL 14
-RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
-    && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
-    && apt update \
-    && apt install -y postgresql-client-14 \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# AWS CLI
-RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
-    && unzip -q awscliv2.zip \
-    && ./aws/install \
-    && rm awscliv2.zip
-
-# Mold: A Modern Linker
-ENV MOLD_VERSION v2.4.0
-RUN set -e \
-    && git clone https://github.com/rui314/mold.git \
-    && mkdir mold/build \
-    && cd mold/build \
-    && git checkout ${MOLD_VERSION} \
-    && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
-    && cmake --build . -j $(nproc) \
-    && cmake --install . \
-    && cd .. \
-    && rm -rf mold
-
-# LCOV
-# Build lcov from a fork:
-# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
-# And patches from us:
-# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
-RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
-    && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
-    && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992  lcov.tar.gz" | sha256sum --check \
-    && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
-    && cd lcov \
-    && make install \
-    && rm -rf ../lcov.tar.gz
-
-# Switch to nonroot user
-USER nonroot:nonroot
-WORKDIR /home/nonroot
-
-# Python
-ENV PYTHON_VERSION=3.9.2 \
-    PYENV_ROOT=/home/nonroot/.pyenv \
-    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
-RUN set -e \
-    && cd $HOME \
-    && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
-    && chmod +x pyenv-installer \
-    && ./pyenv-installer \
-    && export PYENV_ROOT=/home/nonroot/.pyenv \
-    && export PATH="$PYENV_ROOT/bin:$PATH" \
-    && export PATH="$PYENV_ROOT/shims:$PATH" \
-    && pyenv install ${PYTHON_VERSION} \
-    && pyenv global ${PYTHON_VERSION} \
-    && python --version \
-    && pip install --upgrade pip \
-    && pip --version \
-    && pip install pipenv wheel poetry
-
-# Switch to nonroot user (again)
-USER nonroot:nonroot
-WORKDIR /home/nonroot
-
-# Rust
-# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.74.0
-ENV RUSTUP_HOME="/home/nonroot/.rustup"
-ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
-	chmod +x rustup-init && \
-	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
-	rm rustup-init && \
-    export PATH="$HOME/.cargo/bin:$PATH" && \
-    . "$HOME/.cargo/env" && \
-    cargo --version && rustup --version && \
-    rustup component add llvm-tools-preview rustfmt clippy && \
-    cargo install --git https://github.com/paritytech/cachepot && \
-    cargo install rustfilt && \
-    cargo install cargo-hakari && \
-    cargo install cargo-deny && \
-    cargo install cargo-hack && \
-    cargo install cargo-nextest && \
-    rm -rf /home/nonroot/.cargo/registry && \
-    rm -rf /home/nonroot/.cargo/git
-ENV RUSTC_WRAPPER=cachepot
-
-# Show versions
-RUN whoami \
-    && python --version \
-    && pip --version \
-    && cargo --version --verbose \
-    && rustup --version --verbose \
-    && rustc --version --verbose \
-    && clang --version
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -1,6 +1,6 @@
 ARG PG_VERSION
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG

@@ -48,29 +48,7 @@ RUN cd postgres && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
-    # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
-    # In vanilla postgres this function is limited to Postgres role superuser.
-    # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
-    # We could add the additional grant statements to the postgres repository but it would be hard to maintain, 
-    # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
-    # so we do it here.
-    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
-    # the first loop is for pg_stat_statement extension version <= 1.6
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        if echo "$old_list" | grep -q -F "$filename"; then \
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-        fi; \
-    done; \
-    # the second loop is for pg_stat_statement extension versions >= 1.7, 
-    # where pg_stat_statement_reset() got 3 additional arguments
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        if ! echo "$old_list" | grep -q -F "$filename"; then \
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-        fi; \
-    done      
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control

 #########################################################################################
 #
@@ -591,23 +569,6 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control

-#########################################################################################
-#
-# Layer "pg-semver-pg-build"
-# compile pg_semver extension
-#
-#########################################################################################
-FROM build-deps AS pg-semver-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
-    echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
-    mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
-
 #########################################################################################
 #
 # Layer "pg-embedding-pg-build"
@@ -807,7 +768,6 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY pgxn/ pgxn/
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,7 +1,7 @@
 # First transient image to build compute_tools binaries
 # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG

--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -13,7 +13,6 @@ clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
 hyper = { workspace = true, features = ["full"] }
-nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
 opentelemetry.workspace = true
@@ -21,7 +20,6 @@ postgres.workspace = true
 regex.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-signal-hook.workspace = true
 tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
@@ -39,6 +37,5 @@ workspace_hack.workspace = true
 toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
-zstd = "0.13"
+zstd = "0.12.4"
 bytes = "1.0"
-rust-ini = "0.20.0"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -31,31 +31,25 @@
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
 //!             -b /usr/local/bin/postgres \
-//!             -r http://pg-ext-s3-gateway \
-//!             --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
-//!             --pgbouncer-ini-path /etc/pgbouncer.ini \
+//!             -r http://pg-ext-s3-gateway
 //! ```
 //!
 use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
-use std::sync::atomic::Ordering;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use nix::sys::signal::{kill, Signal};
-use signal_hook::consts::{SIGQUIT, SIGTERM};
-use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info};
 use url::Url;

 use compute_api::responses::ComputeStatus;

-use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
+use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
@@ -71,13 +65,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
-    thread::spawn(move || {
-        for sig in signals.forever() {
-            handle_exit_signal(sig);
-        }
-    });
-
    let build_tag = option_env!("BUILD_TAG")
        .unwrap_or(BUILD_TAG_DEFAULT)
        .to_string();
@@ -112,9 +99,6 @@ fn main() -> Result<()> {
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");

-    let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
-    let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
-
    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
    // tracing context.
@@ -225,8 +209,6 @@ fn main() -> Result<()> {
        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
-        pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
-        pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
    };
    let compute = Arc::new(compute_node);

@@ -357,7 +339,6 @@ fn main() -> Result<()> {
        let ecode = pg
            .wait()
            .expect("failed to start waiting on Postgres process");
-        PG_PID.store(0, Ordering::SeqCst);
        info!("Postgres exited with code {}, shutting down", ecode);
        exit_code = ecode.code()
    }
@@ -512,41 +493,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            Arg::new("pgbouncer-connstr")
-                .long("pgbouncer-connstr")
-                .default_value(
-                    "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
-                )
-                .value_name("PGBOUNCER_CONNSTR"),
-        )
-        .arg(
-            Arg::new("pgbouncer-ini-path")
-                .long("pgbouncer-ini-path")
-                // Note: this doesn't match current path for pgbouncer.ini.
-                // Until we fix it, we need to pass the path explicitly
-                // or this will be effectively no-op.
-                .default_value("/etc/pgbouncer.ini")
-                .value_name("PGBOUNCER_INI_PATH"),
-        )
-}
-
-/// When compute_ctl is killed, send also termination signal to sync-safekeepers
-/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
-/// wait for termination which would be easy then.
-fn handle_exit_signal(sig: i32) {
-    info!("received {sig} termination signal");
-    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
-    if ss_pid != 0 {
-        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
-        kill(ss_pid, Signal::SIGTERM).ok();
-    }
-    let pg_pid = PG_PID.load(Ordering::SeqCst);
-    if pg_pid != 0 {
-        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        kill(pg_pid, Signal::SIGTERM).ok();
-    }
-    exit(1);
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -6,10 +6,7 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::atomic::AtomicU32;
-use std::sync::atomic::Ordering;
 use std::sync::{Condvar, Mutex, RwLock};
-use std::thread;
 use std::time::Instant;

 use anyhow::{Context, Result};
@@ -36,9 +33,6 @@ use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
 use crate::{config, extension_server};

-pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
-pub static PG_PID: AtomicU32 = AtomicU32::new(0);
-
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
    // Url type maintains proper escaping
@@ -70,10 +64,6 @@ pub struct ComputeNode {
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
-    // connection string to pgbouncer to change settings
-    pub pgbouncer_connstr: Option<String>,
-    // path to pgbouncer.ini to change settings
-    pub pgbouncer_ini_path: Option<String>,
 }

 // store some metrics about download size that might impact startup time
@@ -506,7 +496,6 @@ impl ComputeNode {
            .stdout(Stdio::piped())
            .spawn()
            .expect("postgres --sync-safekeepers failed to start");
-        SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);

        // `postgres --sync-safekeepers` will print all log output to stderr and
        // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
@@ -514,7 +503,6 @@ impl ComputeNode {
        let sync_output = sync_handle
            .wait_with_output()
            .expect("postgres --sync-safekeepers failed");
-        SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);

        if !sync_output.status.success() {
            anyhow::bail!(
@@ -669,7 +657,6 @@ impl ComputeNode {
            })
            .spawn()
            .expect("cannot start postgres process");
-        PG_PID.store(pg.id(), Ordering::SeqCst);

        wait_for_postgres(&mut pg, pgdata_path)?;

@@ -750,31 +737,6 @@ impl ComputeNode {
    pub fn reconfigure(&self) -> Result<()> {
        let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;

-        if let Some(connstr) = &self.pgbouncer_connstr {
-            info!("tuning pgbouncer with connstr: {:?}", connstr);
-
-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .expect("failed to create rt");
-
-            // Spawn a thread to do the tuning,
-            // so that we don't block the main thread that starts Postgres.
-            let pgbouncer_settings = spec.pgbouncer_settings.clone();
-            let connstr_clone = connstr.clone();
-            let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
-            let _handle = thread::spawn(move || {
-                let res = rt.block_on(tune_pgbouncer(
-                    pgbouncer_settings,
-                    &connstr_clone,
-                    pgbouncer_ini_path,
-                ));
-                if let Err(err) = res {
-                    error!("error while tuning pgbouncer: {err:?}");
-                }
-            });
-        }
-
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
@@ -829,32 +791,6 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        // tune pgbouncer
-        if let Some(connstr) = &self.pgbouncer_connstr {
-            info!("tuning pgbouncer with connstr: {:?}", connstr);
-
-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .expect("failed to create rt");
-
-            // Spawn a thread to do the tuning,
-            // so that we don't block the main thread that starts Postgres.
-            let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
-            let connstr_clone = connstr.clone();
-            let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
-            let _handle = thread::spawn(move || {
-                let res = rt.block_on(tune_pgbouncer(
-                    pgbouncer_settings,
-                    &connstr_clone,
-                    pgbouncer_ini_path,
-                ));
-                if let Err(err) = res {
-                    error!("error while tuning pgbouncer: {err:?}");
-                }
-            });
-        }
-
        info!(
            "start_compute spec.remote_extensions {:?}",
            pspec.spec.remote_extensions
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -3,7 +3,7 @@ use std::{thread, time::Duration};

 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tracing::{debug, info, warn};
+use tracing::{debug, info};

 use crate::compute::ComputeNode;

@@ -84,29 +84,6 @@ fn watch_compute_activity(compute: &ComputeNode) {
                    }
                }

-                // If there are existing (logical) walsenders, do not suspend.
-                //
-                // walproposer doesn't currently show up in pg_stat_replication,
-                // but protect if it will be
-                let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
-                match cli.query_one(ws_count_query, &[]) {
-                    Ok(r) => match r.try_get::<&str, i64>("count") {
-                        Ok(num_ws) => {
-                            if num_ws > 0 {
-                                last_active = Some(Utc::now());
-                            }
-                        }
-                        Err(e) => {
-                            warn!("failed to parse ws count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!("failed to get list of walsenders: {:?}", e);
-                        continue;
-                    }
-                }
-
                // Update the last activity in the shared state if we got a more recent one.
                let mut state = compute.state.lock().unwrap();
                // NB: `Some(<DateTime>)` is always greater than `None`.
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -9,11 +9,9 @@ use std::process::Child;
 use std::time::{Duration, Instant};

 use anyhow::{bail, Result};
-use ini::Ini;
 use notify::{RecursiveMode, Watcher};
 use postgres::{Client, Transaction};
-use tokio_postgres::NoTls;
-use tracing::{debug, error, info, instrument};
+use tracing::{debug, instrument};

 use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};

@@ -361,68 +359,3 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {

    Ok(())
 }
-
-/// Update pgbouncer.ini with provided options
-pub fn update_pgbouncer_ini(
-    pgbouncer_config: HashMap<String, String>,
-    pgbouncer_ini_path: &str,
-) -> Result<()> {
-    let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
-    let section = conf.section_mut(Some("pgbouncer")).unwrap();
-
-    for (option_name, value) in pgbouncer_config.iter() {
-        section.insert(option_name, value);
-    }
-
-    conf.write_to_file(pgbouncer_ini_path)?;
-    Ok(())
-}
-
-/// Tune pgbouncer.
-/// 1. Apply new config using pgbouncer admin console
-/// 2. Add new values to pgbouncer.ini to preserve them after restart
-pub async fn tune_pgbouncer(
-    pgbouncer_settings: Option<HashMap<String, String>>,
-    pgbouncer_connstr: &str,
-    pgbouncer_ini_path: Option<String>,
-) -> Result<()> {
-    if let Some(pgbouncer_config) = pgbouncer_settings {
-        // Apply new config
-        let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
-        let (client, connection) = connect_result.unwrap();
-        tokio::spawn(async move {
-            if let Err(e) = connection.await {
-                eprintln!("connection error: {}", e);
-            }
-        });
-
-        for (option_name, value) in pgbouncer_config.iter() {
-            info!(
-                "Applying pgbouncer setting change: {} = {}",
-                option_name, value
-            );
-            let query = format!("SET {} = {}", option_name, value);
-
-            let result = client.simple_query(&query).await;
-
-            info!("Applying pgbouncer setting change: {}", query);
-            info!("pgbouncer setting change result: {:?}", result);
-
-            if let Err(err) = result {
-                // Don't fail on error, just print it into log
-                error!(
-                    "Failed to apply pgbouncer setting change: {},  {}",
-                    query, err
-                );
-            };
-        }
-
-        // save values to pgbouncer.ini
-        // so that they are preserved after pgbouncer restart
-        if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
-            update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
-        }
-    }
-
-    Ok(())
-}
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -298,7 +298,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
                // from neon_superuser.
                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
@@ -370,49 +370,33 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli
    Ok(())
 }

-fn reassign_owned_objects_in_one_db(
-    conf: Config,
-    role_name: &PgIdent,
-    db_owner: &PgIdent,
-) -> Result<()> {
-    let mut client = conf.connect(NoTls)?;
-
-    // This will reassign all dependent objects to the db owner
-    let reassign_query = format!(
-        "REASSIGN OWNED BY {} TO {}",
-        role_name.pg_quote(),
-        db_owner.pg_quote()
-    );
-    info!(
-        "reassigning objects owned by '{}' in db '{}' to '{}'",
-        role_name,
-        conf.get_dbname().unwrap_or(""),
-        db_owner
-    );
-    client.simple_query(&reassign_query)?;
-
-    // This now will only drop privileges of the role
-    let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
-    client.simple_query(&drop_query)?;
-    Ok(())
-}
-
 // Reassign all owned objects in all databases to the owner of the database.
 fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
    for db in &spec.cluster.databases {
        if db.owner != *role_name {
            let mut conf = Config::from_str(connstr)?;
            conf.dbname(&db.name);
-            reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?;
+
+            let mut client = conf.connect(NoTls)?;
+
+            // This will reassign all dependent objects to the db owner
+            let reassign_query = format!(
+                "REASSIGN OWNED BY {} TO {}",
+                role_name.pg_quote(),
+                db.owner.pg_quote()
+            );
+            info!(
+                "reassigning objects owned by '{}' in db '{}' to '{}'",
+                role_name, &db.name, &db.owner
+            );
+            client.simple_query(&reassign_query)?;
+
+            // This now will only drop privileges of the role
+            let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
+            client.simple_query(&drop_query)?;
        }
    }

-    // Also handle case when there are no databases in the spec.
-    // In this case we need to reassign objects in the default database.
-    let conf = Config::from_str(connstr)?;
-    let db_owner = PgIdent::from_str("cloud_admin")?;
-    reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?;
-
    Ok(())
 }

--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -46,8 +46,6 @@ use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
 use compute_api::spec::RemoteExtSpec;
-use nix::sys::signal::kill;
-use nix::sys::signal::Signal;
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};

@@ -441,14 +439,11 @@ impl Endpoint {
        Ok(())
    }

-    fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
+    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
        // TODO use background_process::stop_process instead
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
-        if send_sigterm {
-            kill(pid, Signal::SIGTERM).ok();
-        }
        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
        Ok(())
    }
@@ -542,7 +537,6 @@ impl Endpoint {
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
            remote_extensions,
-            pgbouncer_settings: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -555,8 +549,10 @@ impl Endpoint {

        // Launch compute_ctl
        println!("Starting postgres node at '{}'", self.connstr());
-        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
-        cmd.args(["--http-port", &self.http_address.port().to_string()])
+        let mut cmd = Command::new("/usr/bin/taskset");
+        cmd.args(["-c".to_string(), "8-11".to_string()])
+            .args([self.env.neon_distrib_dir.join("compute_ctl")])
+            .args(["--http-port", &self.http_address.port().to_string()])
            .args(["--pgdata", self.pgdata().to_str().unwrap()])
            .args(["--connstr", &self.connstr()])
            .args([
@@ -738,15 +734,10 @@ impl Endpoint {
            &None,
        )?;

-        // Also wait for the compute_ctl process to die. It might have some
-        // cleanup work to do after postgres stops, like syncing safekeepers,
-        // etc.
+        // Also wait for the compute_ctl process to die. It might have some cleanup
+        // work to do after postgres stops, like syncing safekeepers, etc.
        //
-        // If destroying, send it SIGTERM before waiting. Sometimes we do *not*
-        // want this cleanup: tests intentionally do stop when majority of
-        // safekeepers is down, so sync-safekeepers would hang otherwise. This
-        // could be a separate flag though.
-        self.wait_for_compute_ctl_to_exit(destroy)?;
+        self.wait_for_compute_ctl_to_exit()?;
        if destroy {
            println!(
                "Destroying postgres data directory '{}'",
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -12,6 +12,7 @@ use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::process::{Child, Command};
+use std::str::FromStr;
 use std::time::Duration;

 use anyhow::{bail, Context};
@@ -216,11 +217,19 @@ impl PageServerNode {
        if update_config {
            args.push(Cow::Borrowed("--update-config"));
        }
+
+        let mut taskset_args = vec![
+            "-c".to_string(), // taskset: pin to the CPU list that follows
+            format!("{}", self.conf.id.0.saturating_sub(1)), // CPU = id - 1, floored at 0
+            self.env.pageserver_bin().to_string_lossy().into(),
+        ];
+        taskset_args.extend(args.into_iter().map(|a| a.to_string()));
+
        background_process::start_process(
            "pageserver",
            &datadir,
-            &self.env.pageserver_bin(),
-            args.iter().map(Cow::as_ref),
+            &PathBuf::from_str("/usr/bin/taskset").unwrap(),
+            taskset_args,
            self.pageserver_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
            || async {
@@ -485,13 +494,6 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_id).await?)
    }

-    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
-        Ok(self
-            .http_client
-            .tenant_secondary_download(*tenant_id)
-            .await?)
-    }
-
    pub async fn timeline_create(
        &self,
        tenant_id: TenantId,
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -11,7 +11,6 @@ use crate::{
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
-use pageserver_api::shard::TenantShardId;
 use std::collections::HashMap;
 use std::time::Duration;
 use utils::{
@@ -41,9 +40,9 @@ async fn await_lsn(
    loop {
        let latest = match get_lsns(tenant_id, pageserver).await {
            Ok(l) => l,
-            Err(_e) => {
+            Err(e) => {
                println!(
-                    "🕑 Waiting for pageserver {} to activate...",
+                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
                    pageserver.conf.id
                );
                std::thread::sleep(Duration::from_millis(500));
@@ -90,7 +89,7 @@ pub async fn migrate_tenant(
    tenant_id: TenantId,
    dest_ps: PageServerNode,
 ) -> anyhow::Result<()> {
-    println!("🤔 Checking existing status...");
+    // Set up the attachment service client; it is used below to obtain a new generation
    let attachment_service = AttachmentService::from_env(env);

    fn build_location_config(
@@ -136,20 +135,6 @@ pub async fn migrate_tenant(
        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
    }

-    println!(
-        "🔁 Downloading latest layers to destination pageserver {}",
-        dest_ps.conf.id
-    );
-    match dest_ps
-        .tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
-        .await
-    {
-        Ok(()) => {}
-        Err(_) => {
-            println!("  (skipping, destination wasn't in secondary mode)")
-        }
-    }
-
    let gen = attachment_service
        .attach_hook(tenant_id, dest_ps.conf.id)
        .await?;
--- a/deny.toml
+++ b/deny.toml
@@ -35,7 +35,6 @@ allow = [
    "Artistic-2.0",
    "BSD-2-Clause",
    "BSD-3-Clause",
-    "CC0-1.0",
    "ISC",
    "MIT",
    "MPL-2.0",
--- a/docs/rfcs/029-getpage-throttling.md
+++ b/docs/rfcs/029-getpage-throttling.md
@@ -1,197 +0,0 @@
-# Per-Tenant GetPage@LSN Throttling
-
-Author: Christian Schwarz
-Date: Oct 24, 2023
-
-## Summary
-
-This RFC proposes per-tenant throttling of GetPage@LSN requests inside Pageserver
-and the interactions with its client, i.e., the neon_smgr component in Compute.
-
-The result of implementing & executing this RFC will be a fleet-wide upper limit for
-**"the highest GetPage/second that Pageserver can support for a single tenant/shard"**.
-
-## Background
-
-### GetPage@LSN Request Flow
-
-Pageserver exposes its `page_service.rs` as a libpq listener.
-The Computes' `neon_smgr` module connects to that libpq listener.
-Once a connection is established, the protocol allows Compute to request page images at a given LSN.
-We call these requests GetPage@LSN requests, or GetPage requests for short.
-Other request types can be sent, but these are low traffic compared to GetPage requests
-and are not the concern of this RFC.
-
-Pageserver associates one libpq connection with one tokio task.
-
-Per connection/task, the pq protocol is handled by the common `postgres_backend` crate.
-Its `run_message_loop` function invokes the `page_service` specific `impl<IO> postgres_backend::Handler<IO> for PageServerHandler`.
-Requests are processed in the order in which they arrive via the TCP-based pq protocol.
-So, there is no concurrent request processing within one connection/task.
-
-There is a degree of natural pipelining:
-Compute can "fill the pipe" by sending more than one GetPage request into the libpq TCP stream.
-And Pageserver can fill the pipe with responses in the other direction.
-Both directions are subject to the limit of tx/rx buffers, nodelay, TCP flow control, etc.
-
-### GetPage@LSN Access Pattern
-
-The Compute has its own hierarchy of caches, specifically `shared_buffers` and the `local file cache` (LFC).
-Compute only issues GetPage requests to Pageserver if it encounters a miss in these caches.
-
-If the working set stops fitting into Compute's caches, requests to Pageserver increase sharply -- the Compute starts *thrashing*.
-
-## Motivation
-
-In INC-69, a tenant issued 155k GetPage/second for a period of 10 minutes and 60k GetPage/second for a period of 3h,
-then dropping to ca 18k GetPage/second for a period of 9h.
-
-We noticed this because of an internal GetPage latency SLO burn rate alert, i.e.,
-the request latency profile during this period significantly exceeded what was acceptable according to the internal SLO.
-
-Sadly, we do not have the observability data to determine the impact of this tenant on other tenants on the same tenants.
-
-However, here are some illustrative data points for the 155k period:
-The tenant was responsible for >= 99% of the GetPage traffic and, frankly, the overall activity on this Pageserver instance.
-We were serving pages at 10 Gb/s (`155k x 8 kbyte (PAGE_SZ) per second is 1.12GiB/s = 9.4Gb/s.`)
-The CPU utilization of the instance was 75% user+system.
-Pageserver page cache served 1.75M accesses/second at a hit rate of ca 90%.
-The hit rate for materialized pages was ca. 40%.
-Curiously, IOPS to the Instance Store NVMe were very low, rarely exceeding 100.
-
-The fact that the IOPS were so low / the materialized page cache hit rate was so high suggests that **this tenant's compute's caches were thrashing**.
-The compute was of type `k8s-pod`; hence, auto-scaling could/would not have helped remediate the thrashing by provisioning more RAM.
-The consequence was that the **thrashing translated into excessive GetPage requests against Pageserver**.
-
-My claim is that it was **unhealthy to serve this workload at the pace we did**:
-* it is likely that other tenants were/would have experienced high latencies (again, we sadly don't have per-tenant latency data to confirm this)
-* more importantly, it was **unsustainable** to serve traffic at this pace for multiple reasons:
-    * **predictability of performance**: when the working set grows, the pageserver materialized page cache hit rate drops.
-      At some point, we're bound by the EC2 Instance Store NVMe drive's IOPS limit.
-      The result is an **uneven** performance profile from the Compute perspective.
-
-    * **economics**: Neon currently does not charge for IOPS, only capacity.
-      **We cannot afford to undercut the market in IOPS/$ this drastically; it leads to adverse selection and perverse incentives.**
-      For example, the 155k IOPS, which we served for 10min, would cost ca. 6.5k$/month when provisioned as an io2 EBS volume.
-      Even the 18k IOPS, which we served for 9h, would cost ca. 1.1k$/month when provisioned as an io2 EBS volume.
-      We charge 0$.
-      It could be economically advantageous to keep using a low-DRAM compute because Pageserver IOPS are fast enough and free.
-
-
-Note: It is helpful to think of Pageserver as a disk, because it's precisely where `neon_smgr` sits:
-vanilla Postgres gets its pages from disk, Neon Postgres gets them from Pageserver.
-So, regarding the above performance & economic arguments, it is fair to say that we currently provide an "as-fast-as-possible-IOPS" disk that we charge for only by capacity.
-
-## Solution: Throttling GetPage Requests
-
-**The consequence of the above analysis must be that Pageserver throttles GetPage@LSN requests**.
-That is, unless we want to start charging for provisioned GetPage@LSN/second.
-Throttling sets the correct incentive for a thrashing Compute to scale up its DRAM to the working set size.
-Neon Autoscaling will make this easy, [eventually](https://github.com/neondatabase/neon/pull/3913).
-
-## The Design Space
-
-What that remains is the question about *policy* and *mechanism*:
-
-**Policy** concerns itself with the question of what limit applies to a given connection|timeline|tenant.
-Candidates are:
-
-* hard limit, same limit value per connection|timeline|tenant
-    * Per-tenant will provide an upper bound for the impact of a tenant on a given Pageserver instance.
-      This is a major operational pain point / risk right now.
-* hard limit, configurable per connection|timeline|tenant
-    * This outsources policy to console/control plane, with obvious advantages for flexible structuring of what service we offer to customers.
-    * Note that this is not a mechanism to guarantee a minium provisioned rate, i.e., this is not a mechanism to guarantee a certain QoS for a tenant.
-* fair share among active connections|timelines|tenants per instance
-    * example: each connection|timeline|tenant gets a fair fraction of the machine's GetPage/second capacity
-    * NB: needs definition of "active", and knowledge of available GetPage/second capacity in advance
-* ...
-
-
-Regarding **mechanism**, it's clear that **backpressure** is the way to go.
-However, we must choose between
-* **implicit** backpressure through pq/TCP and
-* **explicit** rejection of requests + retries with exponential backoff
-
-Further, there is the question of how throttling GetPage@LSN will affect the **internal GetPage latency SLO**:
-where do we measure the SLI for Pageserver's internal getpage latency SLO? Before or after the throttling?
-
-And when we eventually move the measurement point into the Computes (to avoid coordinated omission),
-how do we avoid counting throttling-induced latency toward the internal getpage latency SLI/SLO?
-
-## Scope Of This RFC
-
-**This RFC proposes introducing a hard GetPage@LSN/second limit per tenant, with the same value applying to each tenant on a Pageserver**.
-
-This proposal is easy to implement and significantly de-risks operating large Pageservers,
-based on the assumption that extremely-high-GetPage-rate-episodes like the one from the "Motivation" section are uncorrelated between tenants.
-
-For example, suppose we pick a limit that allows up to 10 tenants to go at limit rate.
-Suppose our Pageserver can serve 100k GetPage/second total at a 100% page cache miss rate.
-If each tenant gets a hard limit of 10k GetPage/second, we can serve up to 10 tenants at limit speed without latency degradation.
-
-The mechanism for backpressure will be TCP-based implicit backpressure.
-The compute team isn't concerned about prefetch queue depth.
-Pageserver will implement it by delaying the reading of requests from the libpq connection(s).
-
-The rate limit will be implemented using a per-tenant token bucket.
-The bucket will be be shared among all connections to the tenant.
-The bucket implementation supports starvation-preventing `await`ing.
-The current candidate for the implementation is [`leaky_bucket`](https://docs.rs/leaky-bucket/).
-The getpage@lsn benchmark that's being added in https://github.com/neondatabase/neon/issues/5771
-can be used to evaluate the overhead of sharing the bucket among connections of a tenant.
-A possible technique to mitigate the impact of sharing the bucket would be to maintain a buffer of a few tokens per connection handler.
-
-Regarding metrics / the internal GetPage latency SLO:
-we will measure the GetPage latency SLO _after_ the throttler and introduce a new metric to measure the amount of throttling, quantified by:
- histogram that records the tenants' observations of queue depth before they start waiting (one such histogram per pageserver)
- histogram that records the tenants' observations of time spent waiting (one such histogram per pageserver)
-
-Further observability measures:
- an INFO log message at frequency 1/min if the tenant/timeline/connection was throttled in that last minute.
-  The message will identify the tenant/timeline/connection to allow correlation with compute logs/stats.
-
-Rollout will happen as follows:
- deploy 1: implementation + config: disabled by default, ability to enable it per tenant through tenant_conf
- experimentation in staging and later production to study impact & interaction with auto-scaling
- determination of a sensible global default value
-  - the value will be chosen as high as possible ...
-  - ... but low enough to work towards this RFC's goal that one tenant should not be able to dominate a pageserver instance.
- deploy 2: implementation fixes if any + config: enabled by default with the aforementioned global default
- reset of the experimental per-tenant overrides
- gain experience & lower the limit over time
-  - we stop lowering the limit as soon as this RFC's goal is achieved, i.e.,
-    once we decide that in practice the chosen value sufficiently de-risks operating large pageservers
-
-The per-tenant override will remain for emergencies and testing.
-But since Console doesn't preserve it during tenant migrations, it isn't durably configurable for the tenant.
-
-Toward the upper layers of the Neon stack, the resulting limit will be
-**"the highest GetPage/second that Pageserver can support for a single tenant"**.
-
-### Rationale
-
-We decided against error + retry because of worries about starvation.
-
-## Future Work
-
-Enable per-tenant emergency override of the limit via Console.
-Should be part of a more general framework to specify tenant config overrides.
-**NB:** this is **not** the right mechanism to _sell_ different max GetPage/second levels to users,
-or _auto-scale_ the GetPage/second levels. Such functionality will require a separate RFC that
-concerns itself with GetPage/second capacity planning.
-
-Compute-side metrics for GetPage latency.
-
-Back-channel to inform Compute/Autoscaling/ControlPlane that the project is being throttled.
-
-Compute-side neon_smgr improvements to avoid sending the same GetPage request multiple times if multiple backends experience a cache miss.
-
-Dealing with read-only endpoints: users use read-only endpoints to scale reads for a single tenant.
-Possibly there are also assumptions around read-only endpoints not affecting the primary read-write endpoint's performance.
-With per-tenant rate limiting, we will not meet that expectation.
-However, we can currently only scale per tenant.
-Soon, we will have sharding (#5505), which will apply the throttling on a per-shard basis.
-But, that's orthogonal to scaling reads: if many endpoints hit one shard, they share the same throttling limit.
-To solve this properly, I think we'll need replicas for tenants / shard.
-To performance-isolate a tenant's endpoints from each other, we'd then route them to different replicas.
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

 ### Obligatory checks
-We force code formatting via `ruff`, and type hints via `mypy`.
+We enforce code formatting via `black`, linting via `ruff`, and type hints via `mypy`.
 Run the following commands in the repository's root (next to `pyproject.toml`):

 ```bash
-poetry run ruff format . # All code is reformatted
-poetry run ruff check .  # Python linter
-poetry run mypy .        # Ensure there are no typing errors
+poetry run black .  # All code is reformatted
+poetry run ruff .  # Python linter
+poetry run mypy .  # Ensure there are no typing errors
 ```

 **WARNING**: do not run `mypy` from a directory other than the root of the repository.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -73,8 +73,6 @@ pub struct ComputeSpec {

    // information about available remote extensions
    pub remote_extensions: Option<RemoteExtSpec>,
-
-    pub pgbouncer_settings: Option<HashMap<String, String>>,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -243,9 +243,5 @@
        "public_extensions": [
          "postgis"
        ]
-      },
-      "pgbouncer_settings": {
-        "default_pool_size": "42",
-        "pool_mode": "session"
      }
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -142,7 +142,7 @@ impl Key {
 }

 pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
+    key.field1 == 0x00 && key.field4 != 0
 }

 impl std::str::FromStr for Key {
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -124,9 +124,6 @@ impl KeySpaceAccum {
                if range.start == accum.end {
                    accum.end = range.end;
                } else {
-                    // TODO: to efficiently support small sharding stripe sizes, we should avoid starting
-                    // a new range here if the skipped region was all keys that don't belong on this shard.
-                    // (https://github.com/neondatabase/neon/issues/6247)
                    assert!(range.start > accum.end);
                    self.ranges.push(accum.clone());
                    *accum = range;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -557,6 +557,19 @@ pub enum DownloadRemoteLayersTaskState {
    ShutDown,
 }

+pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
+
+/// Information for configuring a single fail point
+#[derive(Debug, Serialize, Deserialize)]
+pub struct FailpointConfig {
+    /// Name of the fail point
+    pub name: String,
+    /// List of actions to take, using the format described in `fail::cfg`
+    ///
+    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
+    pub actions: String,
+}
+
 #[derive(Debug, Serialize, Deserialize)]
 pub struct TimelineGcRequest {
    pub gc_horizon: Option<u64>,
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -163,7 +163,7 @@ impl From<[u8; 18]> for TenantShardId {
 /// shard we're dealing with, but do not need to know the full ShardIdentity (because
 /// we won't be doing any page->shard mapping), and do not need to know the fully qualified
 /// TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
 pub struct ShardIndex {
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
@@ -422,21 +422,6 @@ impl ShardIdentity {
        }
    }

-    /// Return true if the key should be discarded if found in this shard's
-    /// data store, e.g. during compaction after a split
-    pub fn is_key_disposable(&self, key: &Key) -> bool {
-        if key_is_shard0(key) {
-            // Q: Why can't we dispose of shard0 content if we're not shard 0?
-            // A: because the WAL ingestion logic currently ingests some shard 0
-            //    content on all shards, even though it's only read on shard 0.  If we
-            //    dropped it, then subsequent WAL ingest to these keys would encounter
-            //    an error.
-            false
-        } else {
-            !self.is_key_local(key)
-        }
-    }
-
    pub fn shard_slug(&self) -> String {
        if self.count > ShardCount(0) {
            format!("-{:02x}{:02x}", self.number.0, self.count.0)
@@ -530,7 +515,12 @@ fn key_is_shard0(key: &Key) -> bool {
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
-    !is_rel_block_key(key)
+    //
+    // In this condition:
+    // - is_rel_block_key matches only relation keys, i.e. it excludes SLRU data
+    //   and all metadata.
+    // - field6 is 0xffffffff (i.e. -1) for relation size keys, which stay on shard 0.
+    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -35,12 +35,6 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
-    /// Query handler indicated that client should reconnect
-    #[error("Server requested reconnect")]
-    Reconnect,
-    /// Query named an entity that was not found
-    #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
    /// Authentication failure
    #[error("Unauthorized: {0}")]
    Unauthorized(std::borrow::Cow<'static, str>),
@@ -60,9 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
-            Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -431,11 +425,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                info!("Stopped due to shutdown");
                Ok(())
            }
-            Err(QueryError::Reconnect) => {
-                // Dropping out of this loop implicitly disconnects
-                info!("Stopped due to handler reconnect request");
-                Ok(())
-            }
            Err(QueryError::Disconnected(e)) => {
                info!("Disconnected ({e:#})");
                // Disconnection is not an error: we just use it that way internally to drop
@@ -985,9 +974,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
 pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
-        QueryError::Reconnect => "reconnect".to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
-        QueryError::NotFound(_) => "not found".to_string(),
        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
@@ -1009,15 +996,9 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::SimulatedConnectionError => {
            error!("query handler for query '{query}' failed due to a simulated connection error")
        }
-        QueryError::Reconnect => {
-            info!("query handler for '{query}' requested client to reconnect")
-        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
-        QueryError::NotFound(reason) => {
-            info!("query handler for '{query}' entity not found: {reason}")
-        }
        QueryError::Unauthorized(e) => {
            warn!("query handler for '{query}' failed with authentication error: {e}");
        }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -117,8 +117,6 @@ impl AzureBlobStorage {
    ) -> Result<Download, DownloadError> {
        let mut response = builder.into_stream();

-        let mut etag = None;
-        let mut last_modified = None;
        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
@@ -126,13 +124,6 @@ impl AzureBlobStorage {
        let mut bufs = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
-            let etag_str: &str = part.blob.properties.etag.as_ref();
-            if etag.is_none() {
-                etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
-            }
-            if last_modified.is_none() {
-                last_modified = Some(part.blob.properties.last_modified.into());
-            }
            if let Some(blob_meta) = part.blob.metadata {
                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
            }
@@ -145,8 +136,6 @@ impl AzureBlobStorage {
        }
        Ok(Download {
            download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
-            etag,
-            last_modified,
            metadata: Some(StorageMetadata(metadata)),
        })
    }
@@ -322,12 +311,6 @@ impl RemoteStorage for AzureBlobStorage {
        }
        Ok(())
    }
-
-    async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> {
-        Err(anyhow::anyhow!(
-            "copy for azure blob storage is not implemented"
-        ))
-    }
 }

 pin_project_lite::pin_project! {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -14,9 +14,7 @@ mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{
-    collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
-};
+use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};

 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -207,18 +205,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
-
-    /// Copy a remote object inside a bucket from one path to another.
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
 }

-pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
 pub struct Download {
-    pub download_stream: DownloadStream,
-    /// The last time the file was modified (`last-modified` HTTP header)
-    pub last_modified: Option<SystemTime>,
-    /// A way to identify this specific version of the resource (`etag` HTTP header)
-    pub etag: Option<String>,
+    pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
 }
@@ -377,15 +367,6 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
-
-    pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        match self {
-            Self::LocalFs(s) => s.copy(from, to).await,
-            Self::AwsS3(s) => s.copy(from, to).await,
-            Self::AzureBlob(s) => s.copy(from, to).await,
-            Self::Unreliable(s) => s.copy(from, to).await,
-        }
-    }
 }

 impl GenericRemoteStorage {
@@ -672,7 +653,6 @@ impl ConcurrencyLimiter {
            RequestKind::Put => &self.write,
            RequestKind::List => &self.read,
            RequestKind::Delete => &self.write,
-            RequestKind::Copy => &self.write,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
+use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};

 use super::{RemoteStorage, StorageMetadata};

@@ -331,8 +331,6 @@ impl RemoteStorage for LocalFs {
                .map_err(DownloadError::Other)?;
            Ok(Download {
                metadata,
-                last_modified: None,
-                etag: None,
                download_stream: Box::pin(source),
            })
        } else {
@@ -374,17 +372,17 @@ impl RemoteStorage for LocalFs {
                .await
                .map_err(DownloadError::Other)?;

-            let download_stream: DownloadStream = match end_exclusive {
-                Some(end_exclusive) => Box::pin(ReaderStream::new(
-                    source.take(end_exclusive - start_inclusive),
-                )),
-                None => Box::pin(ReaderStream::new(source)),
-            };
-            Ok(Download {
-                metadata,
-                last_modified: None,
-                etag: None,
-                download_stream,
+            Ok(match end_exclusive {
+                Some(end_exclusive) => Download {
+                    metadata,
+                    download_stream: Box::pin(ReaderStream::new(
+                        source.take(end_exclusive - start_inclusive),
+                    )),
+                },
+                None => Download {
+                    metadata,
+                    download_stream: Box::pin(ReaderStream::new(source)),
+                },
            })
        } else {
            Err(DownloadError::NotFound)
@@ -409,20 +407,6 @@ impl RemoteStorage for LocalFs {
        }
        Ok(())
    }
-
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let from_path = from.with_base(&self.storage_root);
-        let to_path = to.with_base(&self.storage_root);
-        create_target_directory(&to_path).await?;
-        fs::copy(&from_path, &to_path).await.with_context(|| {
-            format!(
-                "Failed to copy file from '{from_path}' to '{to_path}'",
-                from_path = from_path,
-                to_path = to_path
-            )
-        })?;
-        Ok(())
-    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -16,7 +16,6 @@ use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider,
    meta::credentials::CredentialsProviderChain,
-    profile::ProfileFileCredentialsProvider,
    provider_config::ProviderConfig,
    retry::{RetryConfigBuilder, RetryMode},
    web_identity_token::WebIdentityTokenCredentialsProvider,
@@ -75,29 +74,20 @@ impl S3Bucket {

        let region = Some(Region::new(aws_config.bucket_region.clone()));

-        let provider_conf = ProviderConfig::without_region().with_region(region.clone());
-
        let credentials_provider = {
            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
            CredentialsProviderChain::first_try(
                "env",
                EnvironmentVariableCredentialsProvider::new(),
            )
-            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
-            .or_else(
-                "profile-sso",
-                ProfileFileCredentialsProvider::builder()
-                    .configure(&provider_conf)
-                    .build(),
-            )
            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
            // needed to access remote extensions bucket
-            .or_else(
-                "token",
+            .or_else("token", {
+                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
                WebIdentityTokenCredentialsProvider::builder()
                    .configure(&provider_conf)
-                    .build(),
-            )
+                    .build()
+            })
            // uses imds v2
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };
@@ -228,11 +218,17 @@ impl S3Bucket {

        let started_at = ScopeGuard::into_inner(started_at);

+        if get_object.is_err() {
+            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                kind,
+                AttemptOutcome::Err,
+                started_at,
+            );
+        }
+
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
-                let etag = object_output.e_tag.clone();
-                let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());

                let body = object_output.body;
                let body = ByteStreamAsStream::from(body);
@@ -241,33 +237,15 @@ impl S3Bucket {

                Ok(Download {
                    metadata,
-                    etag,
-                    last_modified,
                    download_stream: Box::pin(body),
                })
            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
-                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
-                // an error: we expect to sometimes fetch an object and find it missing,
-                // e.g. when probing for timeline indices.
-                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Ok,
-                    started_at,
-                );
                Err(DownloadError::NotFound)
            }
-            Err(e) => {
-                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Err,
-                    started_at,
-                );
-
-                Err(DownloadError::Other(
-                    anyhow::Error::new(e).context("download s3 object"),
-                ))
-            }
+            Err(e) => Err(DownloadError::Other(
+                anyhow::Error::new(e).context("download s3 object"),
+            )),
        }
    }
 }
@@ -493,38 +471,6 @@ impl RemoteStorage for S3Bucket {
        Ok(())
    }

-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let kind = RequestKind::Copy;
-        let _guard = self.permit(kind).await;
-
-        let started_at = start_measuring_requests(kind);
-
-        // we need to specify bucket_name as a prefix
-        let copy_source = format!(
-            "{}/{}",
-            self.bucket_name,
-            self.relative_path_to_s3_object(from)
-        );
-
-        let res = self
-            .client
-            .copy_object()
-            .bucket(self.bucket_name.clone())
-            .key(self.relative_path_to_s3_object(to))
-            .copy_source(copy_source)
-            .send()
-            .await;
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        res?;
-
-        Ok(())
-    }
-
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
        // if prefix is not none then download file `prefix/from`
        // if prefix is none then download file `from`
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -11,7 +11,6 @@ pub(crate) enum RequestKind {
    Put = 1,
    Delete = 2,
    List = 3,
-    Copy = 4,
 }

 use RequestKind::*;
@@ -23,7 +22,6 @@ impl RequestKind {
            Put => "put_object",
            Delete => "delete_object",
            List => "list_objects",
-            Copy => "copy_object",
        }
    }
    const fn as_index(&self) -> usize {
@@ -31,7 +29,7 @@ impl RequestKind {
    }
 }

-pub(super) struct RequestTyped<C>([C; 5]);
+pub(super) struct RequestTyped<C>([C; 4]);

 impl<C> RequestTyped<C> {
    pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -40,8 +38,8 @@ impl<C> RequestTyped<C> {

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy].into_iter();
-        let arr = std::array::from_fn::<C, 5, _>(|index| {
+        let mut it = [Get, Put, Delete, List].into_iter();
+        let arr = std::array::from_fn::<C, 4, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
            f(next)
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -162,11 +162,4 @@ impl RemoteStorage for UnreliableWrapper {
        }
        Ok(())
    }
-
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        // copy is equivalent to download + upload
-        self.attempt(RemoteOp::Download(from.clone()))?;
-        self.attempt(RemoteOp::Upload(to.clone()))?;
-        self.inner.copy_object(from, to).await
-    }
 }
--- a/libs/remote_storage/tests/common/mod.rs
+++ b/libs/remote_storage/tests/common/mod.rs
@@ -1,200 +0,0 @@
-use std::collections::HashSet;
-use std::ops::ControlFlow;
-use std::path::PathBuf;
-use std::sync::Arc;
-
-use anyhow::Context;
-use bytes::Bytes;
-use camino::Utf8Path;
-use futures::stream::Stream;
-use once_cell::sync::OnceCell;
-use remote_storage::{Download, GenericRemoteStorage, RemotePath};
-use tokio::task::JoinSet;
-use tracing::{debug, error, info};
-
-static LOGGING_DONE: OnceCell<()> = OnceCell::new();
-
-pub(crate) fn upload_stream(
-    content: std::borrow::Cow<'static, [u8]>,
-) -> (
-    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-    usize,
-) {
-    use std::borrow::Cow;
-
-    let content = match content {
-        Cow::Borrowed(x) => Bytes::from_static(x),
-        Cow::Owned(vec) => Bytes::from(vec),
-    };
-    wrap_stream(content)
-}
-
-pub(crate) fn wrap_stream(
-    content: bytes::Bytes,
-) -> (
-    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-    usize,
-) {
-    let len = content.len();
-    let content = futures::future::ready(Ok(content));
-
-    (futures::stream::once(content), len)
-}
-
-pub(crate) async fn download_to_vec(dl: Download) -> anyhow::Result<Vec<u8>> {
-    let mut buf = Vec::new();
-    tokio::io::copy_buf(
-        &mut tokio_util::io::StreamReader::new(dl.download_stream),
-        &mut buf,
-    )
-    .await?;
-    Ok(buf)
-}
-
-// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
-pub(crate) async fn upload_simple_remote_data(
-    client: &Arc<GenericRemoteStorage>,
-    upload_tasks_count: usize,
-) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
-    info!("Creating {upload_tasks_count} remote files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
-            let blob_path = RemotePath::new(
-                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
-            )
-            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, len, &blob_path, None).await?;
-
-            Ok::<_, anyhow::Error>(blob_path)
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok(upload_path) => {
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    if upload_tasks_failed {
-        ControlFlow::Break(uploaded_blobs)
-    } else {
-        ControlFlow::Continue(uploaded_blobs)
-    }
-}
-
-pub(crate) async fn cleanup(
-    client: &Arc<GenericRemoteStorage>,
-    objects_to_delete: HashSet<RemotePath>,
-) {
-    info!(
-        "Removing {} objects from the remote storage during cleanup",
-        objects_to_delete.len()
-    );
-    let mut delete_tasks = JoinSet::new();
-    for object_to_delete in objects_to_delete {
-        let task_client = Arc::clone(client);
-        delete_tasks.spawn(async move {
-            debug!("Deleting remote item at path {object_to_delete:?}");
-            task_client
-                .delete(&object_to_delete)
-                .await
-                .with_context(|| format!("{object_to_delete:?} removal"))
-        });
-    }
-
-    while let Some(task_run_result) = delete_tasks.join_next().await {
-        match task_run_result {
-            Ok(task_result) => match task_result {
-                Ok(()) => {}
-                Err(e) => error!("Delete task failed: {e:?}"),
-            },
-            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
-        }
-    }
-}
-pub(crate) struct Uploads {
-    pub(crate) prefixes: HashSet<RemotePath>,
-    pub(crate) blobs: HashSet<RemotePath>,
-}
-
-pub(crate) async fn upload_remote_data(
-    client: &Arc<GenericRemoteStorage>,
-    base_prefix_str: &'static str,
-    upload_tasks_count: usize,
-) -> ControlFlow<Uploads, Uploads> {
-    info!("Creating {upload_tasks_count} remote files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
-            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
-                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
-            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let (data, data_len) =
-                upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, data_len, &blob_path, None).await?;
-
-            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok((upload_prefix, upload_path)) => {
-                uploaded_prefixes.insert(upload_prefix);
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    let uploads = Uploads {
-        prefixes: uploaded_prefixes,
-        blobs: uploaded_blobs,
-    };
-    if upload_tasks_failed {
-        ControlFlow::Break(uploads)
-    } else {
-        ControlFlow::Continue(uploads)
-    }
-}
-
-pub(crate) fn ensure_logging_ready() {
-    LOGGING_DONE.get_or_init(|| {
-        utils::logging::init(
-            utils::logging::LogFormat::Test,
-            utils::logging::TracingErrorLayerEnablement::Disabled,
-            utils::logging::Output::Stdout,
-        )
-        .expect("logging init failed");
-    });
-}
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -2,23 +2,23 @@ use std::collections::HashSet;
 use std::env;
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
+use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
+use once_cell::sync::OnceCell;
 use remote_storage::{
-    AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+    AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
 };
 use test_context::{test_context, AsyncTestContext};
-use tracing::{debug, info};
+use tokio::task::JoinSet;
+use tracing::{debug, error, info};

-mod common;
-
-use common::{
-    cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
-    upload_stream, wrap_stream,
-};
+static LOGGING_DONE: OnceCell<()> = OnceCell::new();

 const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";

@@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test";
 /// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
 /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
 ///
-/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
+/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
 /// where
 /// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
 /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
@@ -97,7 +97,7 @@ async fn azure_pagination_should_work(
 /// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
 /// See `Azure_pagination_should_work` for more information.
 ///
-/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
+/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_simple_azure_data`]
 /// Then performs the following queries:
 ///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
 ///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
@@ -218,9 +218,18 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res

    ctx.client.upload(data, len, &path, None).await?;

+    async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
+        let mut buf = Vec::new();
+        tokio::io::copy_buf(
+            &mut tokio_util::io::StreamReader::new(dl.download_stream),
+            &mut buf,
+        )
+        .await?;
+        Ok(buf)
+    }
    // Normal download request
    let dl = ctx.client.download(&path).await?;
-    let buf = download_to_vec(dl).await?;
+    let buf = download_and_compare(dl).await?;
    assert_eq!(&buf, &orig);

    // Full range (end specified)
@@ -228,12 +237,12 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
        .client
        .download_byte_range(&path, 0, Some(len as u64))
        .await?;
-    let buf = download_to_vec(dl).await?;
+    let buf = download_and_compare(dl).await?;
    assert_eq!(&buf, &orig);

    // partial range (end specified)
    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
-    let buf = download_to_vec(dl).await?;
+    let buf = download_and_compare(dl).await?;
    assert_eq!(&buf, &orig[4..10]);

    // partial range (end beyond real end)
@@ -241,17 +250,17 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
        .client
        .download_byte_range(&path, 8, Some(len as u64 * 100))
        .await?;
-    let buf = download_to_vec(dl).await?;
+    let buf = download_and_compare(dl).await?;
    assert_eq!(&buf, &orig[8..]);

    // Partial range (end unspecified)
    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
-    let buf = download_to_vec(dl).await?;
+    let buf = download_and_compare(dl).await?;
    assert_eq!(&buf, &orig[4..]);

    // Full range (end unspecified)
    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
-    let buf = download_to_vec(dl).await?;
+    let buf = download_and_compare(dl).await?;
    assert_eq!(&buf, &orig);

    debug!("Cleanup: deleting file at path {path:?}");
@@ -263,6 +272,17 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
    Ok(())
 }

+fn ensure_logging_ready() {
+    LOGGING_DONE.get_or_init(|| {
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
+        )
+        .expect("logging init failed");
+    });
+}
+
 struct EnabledAzure {
    client: Arc<GenericRemoteStorage>,
    base_prefix: &'static str,
@@ -332,7 +352,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {

        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;

-        match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
+        match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
            ControlFlow::Continue(uploads) => {
                info!("Remote objects created successfully");

@@ -394,7 +414,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {

        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;

-        match upload_simple_remote_data(&enabled.client, upload_tasks_count).await {
+        match upload_simple_azure_data(&enabled.client, upload_tasks_count).await {
            ControlFlow::Continue(uploads) => {
                info!("Remote objects created successfully");

@@ -458,3 +478,166 @@ fn create_azure_client(
        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
    ))
 }
+
+struct Uploads {
+    prefixes: HashSet<RemotePath>,
+    blobs: HashSet<RemotePath>,
+}
+
+async fn upload_azure_data(
+    client: &Arc<GenericRemoteStorage>,
+    base_prefix_str: &'static str,
+    upload_tasks_count: usize,
+) -> ControlFlow<Uploads, Uploads> {
+    info!("Creating {upload_tasks_count} Azure files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..upload_tasks_count + 1 {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
+            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
+                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
+            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
+
+            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
+        });
+    }
+
+    let mut upload_tasks_failed = false;
+    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(task_run_result) = upload_tasks.join_next().await {
+        match task_run_result
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"))
+        {
+            Ok((upload_prefix, upload_path)) => {
+                uploaded_prefixes.insert(upload_prefix);
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                upload_tasks_failed = true;
+            }
+        }
+    }
+
+    let uploads = Uploads {
+        prefixes: uploaded_prefixes,
+        blobs: uploaded_blobs,
+    };
+    if upload_tasks_failed {
+        ControlFlow::Break(uploads)
+    } else {
+        ControlFlow::Continue(uploads)
+    }
+}
+
+async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
+    info!(
+        "Removing {} objects from the remote storage during cleanup",
+        objects_to_delete.len()
+    );
+    let mut delete_tasks = JoinSet::new();
+    for object_to_delete in objects_to_delete {
+        let task_client = Arc::clone(client);
+        delete_tasks.spawn(async move {
+            debug!("Deleting remote item at path {object_to_delete:?}");
+            task_client
+                .delete(&object_to_delete)
+                .await
+                .with_context(|| format!("{object_to_delete:?} removal"))
+        });
+    }
+
+    while let Some(task_run_result) = delete_tasks.join_next().await {
+        match task_run_result {
+            Ok(task_result) => match task_result {
+                Ok(()) => {}
+                Err(e) => error!("Delete task failed: {e:?}"),
+            },
+            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
+        }
+    }
+}
+
+// Uploads files `folder{j}/blob_{i}.txt`. See test description for more details.
+async fn upload_simple_azure_data(
+    client: &Arc<GenericRemoteStorage>,
+    upload_tasks_count: usize,
+) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
+    info!("Creating {upload_tasks_count} Azure files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..upload_tasks_count + 1 {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
+            let blob_path = RemotePath::new(
+                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
+            )
+            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
+
+            Ok::<_, anyhow::Error>(blob_path)
+        });
+    }
+
+    let mut upload_tasks_failed = false;
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(task_run_result) = upload_tasks.join_next().await {
+        match task_run_result
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"))
+        {
+            Ok(upload_path) => {
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                upload_tasks_failed = true;
+            }
+        }
+    }
+
+    if upload_tasks_failed {
+        ControlFlow::Break(uploaded_blobs)
+    } else {
+        ControlFlow::Continue(uploaded_blobs)
+    }
+}
+
+// FIXME: copy-pasted from test_real_s3. Share it via a `tests/common/mod.rs` module, which Cargo
+// does not compile as a separate test binary.
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -2,23 +2,23 @@ use std::collections::HashSet;
 use std::env;
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
+use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
+use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
 use test_context::{test_context, AsyncTestContext};
-use tracing::{debug, info};
+use tokio::task::JoinSet;
+use tracing::{debug, error, info};

-mod common;
-
-use common::{
-    cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
-    upload_stream, wrap_stream,
-};
+static LOGGING_DONE: OnceCell<()> = OnceCell::new();

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

@@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test";
 /// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
 /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
 ///
-/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
+/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
 /// where
 /// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
 /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
@@ -95,7 +95,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
 /// See `s3_pagination_should_work` for more information.
 ///
-/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
+/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_simple_s3_data`]
 /// Then performs the following queries:
 ///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
 ///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
@@ -198,65 +198,15 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
    Ok(())
 }

-#[test_context(MaybeEnabledS3)]
-#[tokio::test]
-async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
-    let MaybeEnabledS3::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-
-    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
-
-    let (data, len) = wrap_stream(orig.clone());
-
-    ctx.client.upload(data, len, &path, None).await?;
-
-    // Normal download request
-    let dl = ctx.client.download(&path).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    // Full range (end specified)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 0, Some(len as u64))
-        .await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    // partial range (end specified)
-    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[4..10]);
-
-    // partial range (end beyond real end)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 8, Some(len as u64 * 100))
-        .await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[8..]);
-
-    // Partial range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[4..]);
-
-    // Full range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    debug!("Cleanup: deleting file at path {path:?}");
-    ctx.client
-        .delete(&path)
-        .await
-        .with_context(|| format!("{path:?} removal"))?;
-
-    Ok(())
+fn ensure_logging_ready() {
+    LOGGING_DONE.get_or_init(|| {
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
+        )
+        .expect("logging init failed");
+    });
 }

 struct EnabledS3 {
@@ -328,7 +278,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {

        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;

-        match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
+        match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
            ControlFlow::Continue(uploads) => {
                info!("Remote objects created successfully");

@@ -390,7 +340,7 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {

        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;

-        match upload_simple_remote_data(&enabled.client, upload_tasks_count).await {
+        match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
            ControlFlow::Continue(uploads) => {
                info!("Remote objects created successfully");

@@ -453,3 +403,166 @@ fn create_s3_client(
        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
    ))
 }
+
+struct Uploads {
+    prefixes: HashSet<RemotePath>, // `<base>/sub_prefix_{i}/` of every upload that succeeded
+    blobs: HashSet<RemotePath>, // `.../blob_{i}` object path of every upload that succeeded
+}
+
+/// Spawns `upload_tasks_count` upload tasks, each writing `{base_prefix_str}/sub_prefix_{i}/blob_{i}`.
+async fn upload_s3_data(
+    client: &Arc<GenericRemoteStorage>,
+    base_prefix_str: &'static str,
+    upload_tasks_count: usize,
+) -> ControlFlow<Uploads, Uploads> {
+    info!("Creating {upload_tasks_count} S3 files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..=upload_tasks_count {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
+            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
+                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
+            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
+
+            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
+        });
+    }
+
+    let mut upload_tasks_failed = false; // set when a join or an upload fails
+    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(task_run_result) = upload_tasks.join_next().await {
+        match task_run_result
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"))
+        {
+            Ok((upload_prefix, upload_path)) => {
+                uploaded_prefixes.insert(upload_prefix);
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                upload_tasks_failed = true; // keep draining so the successes are still recorded
+            }
+        }
+    }
+
+    let uploads = Uploads {
+        prefixes: uploaded_prefixes,
+        blobs: uploaded_blobs,
+    };
+    if upload_tasks_failed {
+        ControlFlow::Break(uploads) // partial result: only the uploads that succeeded
+    } else {
+        ControlFlow::Continue(uploads) // every task succeeded
+    }
+}
+
+/// Deletes every path in `objects_to_delete`, one spawned task per object. Failures are only
+/// reported through `error!`; the function itself returns nothing.
+async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
+    info!(
+        "Removing {} objects from the remote storage during cleanup",
+        objects_to_delete.len()
+    );
+    let mut pending = JoinSet::new();
+    for object_to_delete in objects_to_delete {
+        let storage = Arc::clone(client);
+        pending.spawn(async move {
+            debug!("Deleting remote item at path {object_to_delete:?}");
+            storage
+                .delete(&object_to_delete)
+                .await
+                .with_context(|| format!("{object_to_delete:?} removal"))
+        });
+    }
+
+    while let Some(joined) = pending.join_next().await {
+        match joined {
+            Ok(Ok(())) => {}
+            Ok(Err(e)) => error!("Delete task failed: {e:?}"),
+            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
+        }
+    }
+}
+
+// Uploads files `folder{j}/blob_{i}.txt` for `i` in `1..=upload_tasks_count`, where `j = i / 7`.
+// See test description for more details.
+async fn upload_simple_s3_data(
+    client: &Arc<GenericRemoteStorage>,
+    upload_tasks_count: usize,
+) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
+    info!("Creating {upload_tasks_count} S3 files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..=upload_tasks_count {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
+            let blob_path = RemotePath::new(
+                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
+            )
+            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
+
+            Ok::<_, anyhow::Error>(blob_path)
+        });
+    }
+
+    let mut upload_tasks_failed = false; // set when a join or an upload fails
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(task_run_result) = upload_tasks.join_next().await {
+        match task_run_result
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"))
+        {
+            Ok(upload_path) => {
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                upload_tasks_failed = true; // keep draining so the successes are still recorded
+            }
+        }
+    }
+
+    if upload_tasks_failed {
+        ControlFlow::Break(uploaded_blobs) // partial result: only the uploads that succeeded
+    } else {
+        ControlFlow::Continue(uploaded_blobs) // every task succeeded
+    }
+}
+
+/// Converts `content` into `Bytes` and returns whatever `wrap_stream` builds from it.
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    wrap_stream(match content {
+        Cow::Borrowed(slice) => Bytes::from_static(slice),
+        Cow::Owned(owned) => Bytes::from(owned),
+    })
+}
+
+/// Returns a one-item stream yielding `Ok(content)` together with the byte length of `content`.
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let byte_count = content.len();
+    let single_chunk = futures::stream::once(futures::future::ready(Ok(content)));
+    (single_chunk, byte_count)
+}
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -51,9 +51,3 @@ pub struct SkTimelineInfo {
    #[serde(default)]
    pub http_connstr: Option<String>,
 }
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TimelineCopyRequest {
-    pub target_timeline_id: TimelineId,
-    pub until_lsn: Lsn,
-}
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -4,12 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-default = []
-# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
-# which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints"]
-
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
@@ -22,7 +16,6 @@ chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 hyper = { workspace = true, features = ["full"] }
-fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
 nix.workspace = true
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -31,9 +31,6 @@ pub enum ApiError {
    #[error("Shutting down")]
    ShuttingDown,

-    #[error("Timeout")]
-    Timeout(Cow<'static, str>),
-
    #[error(transparent)]
    InternalServerError(anyhow::Error),
 }
@@ -70,10 +67,6 @@ impl ApiError {
                err.to_string(),
                StatusCode::SERVICE_UNAVAILABLE,
            ),
-            ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
-                err.to_string(),
-                StatusCode::REQUEST_TIMEOUT,
-            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -83,10 +83,6 @@ pub mod timeout;

 pub mod sync;

-pub mod failpoint_support;
-
-pub mod yielding_loop;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -367,8 +367,6 @@ impl MonotonicCounter<Lsn> for RecordLsn {
 }

 /// Implements  [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
-///
-/// This is used by the `pagebench` pageserver benchmarking tool.
 pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);

 impl rand::distributions::uniform::SampleUniform for Lsn {
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -15,12 +15,6 @@ pub struct Gate {
    name: String,
 }

-impl std::fmt::Debug for Gate {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Gate<{}>", self.name)
-    }
-}
-
 /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
 /// not complete.
 #[derive(Debug)]
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -1,35 +0,0 @@
-use tokio_util::sync::CancellationToken;
-
-#[derive(thiserror::Error, Debug)]
-pub enum YieldingLoopError {
-    #[error("Cancelled")]
-    Cancelled,
-}
-
-/// Helper for long synchronous loops, e.g. over all tenants in the system.  Periodically
-/// yields to avoid blocking the executor, and after resuming checks the provided
-/// cancellation token to drop out promptly on shutdown.
-#[inline(always)]
-pub async fn yielding_loop<I, T, F>(
-    interval: usize,
-    cancel: &CancellationToken,
-    iter: I,
-    mut visitor: F,
-) -> Result<(), YieldingLoopError>
-where
-    I: Iterator<Item = T>,
-    F: FnMut(T),
-{
-    for (i, item) in iter.enumerate() {
-        visitor(item);
-
-        if i + 1 % interval == 0 {
-            tokio::task::yield_now().await;
-            if cancel.is_cancelled() {
-                return Err(YieldingLoopError::Cancelled);
-            }
-        }
-    }
-
-    Ok(())
-}
--- a/libs/walproposer/bindgen_deps.h
+++ b/libs/walproposer/bindgen_deps.h
@@ -1,2 +1 @@
-#include "postgres.h"
 #include "walproposer.h"
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -8,12 +8,12 @@ use std::ffi::CString;

 use crate::bindings::uint32;
 use crate::bindings::walproposer_api;
-use crate::bindings::NeonWALReadResult;
 use crate::bindings::PGAsyncReadResult;
 use crate::bindings::PGAsyncWriteResult;
 use crate::bindings::Safekeeper;
 use crate::bindings::Size;
 use crate::bindings::StringInfoData;
+use crate::bindings::TimeLineID;
 use crate::bindings::TimestampTz;
 use crate::bindings::WalProposer;
 use crate::bindings::WalProposerConnStatusType;
@@ -178,11 +178,31 @@ extern "C" fn conn_blocking_write(
    }
 }

-extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
+extern "C" fn recovery_download(
+    sk: *mut Safekeeper,
+    _timeline: TimeLineID,
+    startpos: XLogRecPtr,
+    endpos: XLogRecPtr,
+) -> bool {
    unsafe {
        let callback_data = (*(*(*sk).wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).recovery_download(&mut (*wp), &mut (*sk))
+        (*api).recovery_download(&mut (*sk), startpos, endpos)
+    }
+}
+
+#[allow(clippy::unnecessary_cast)]
+extern "C" fn wal_read(
+    sk: *mut Safekeeper,
+    buf: *mut ::std::os::raw::c_char,
+    startptr: XLogRecPtr,
+    count: Size, // length in bytes of the region behind `buf` (used as the slice length below)
+) {
+    unsafe {
+        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); // C buffer as bytes
+        let callback_data = (*(*(*sk).wp).config).callback_data; // reached via sk -> wp -> config
+        let api = callback_data as *mut Box<dyn ApiImpl>; // presumably a live ApiImpl box; verify
+        (*api).wal_read(&mut (*sk), buf, startptr) // forward to ApiImpl::wal_read
    }
 }

@@ -194,28 +214,11 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
    }
 }

-#[allow(clippy::unnecessary_cast)]
-extern "C" fn wal_read(
-    sk: *mut Safekeeper,
-    buf: *mut ::std::os::raw::c_char,
-    startptr: XLogRecPtr,
-    count: Size,
-    _errmsg: *mut *mut ::std::os::raw::c_char,
-) -> NeonWALReadResult {
+extern "C" fn free_event_set(wp: *mut WalProposer) {
    unsafe {
-        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
-        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
-        // TODO: errmsg is not forwarded
-        (*api).wal_read(&mut (*sk), buf, startptr)
-    }
-}
-
-extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).wal_reader_events(&mut (*sk))
+        (*api).free_event_set(&mut (*wp));
    }
 }

@@ -235,14 +238,6 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
    }
 }

-extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).active_state_update_event_set(&mut (*sk));
-    }
-}
-
 extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
    unsafe {
        let callback_data = (*(*(*sk).wp).config).callback_data;
@@ -251,14 +246,6 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
    }
 }

-extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).rm_safekeeper_event_set(&mut (*sk));
-    }
-}
-
 extern "C" fn wait_event_set(
    wp: *mut WalProposer,
    timeout: ::std::os::raw::c_long,
@@ -326,6 +313,14 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog
    }
 }

+extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data; // opaque pointer kept in the config
+        let api = callback_data as *mut Box<dyn ApiImpl>; // presumably a live ApiImpl box; verify
+        (*api).confirm_wal_streamed(&mut (*wp), lsn) // forward to ApiImpl::confirm_wal_streamed
+    }
+}
+
 extern "C" fn log_internal(
    wp: *mut WalProposer,
    level: ::std::os::raw::c_int,
@@ -340,6 +335,14 @@ extern "C" fn log_internal(
    }
 }

+extern "C" fn after_election(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data; // opaque pointer kept in the config
+        let api = callback_data as *mut Box<dyn ApiImpl>; // presumably a live ApiImpl box; verify
+        (*api).after_election(&mut (*wp)) // forward to ApiImpl::after_election
+    }
+}
+
 #[derive(Debug)]
 pub enum Level {
    Debug5,
@@ -398,20 +401,20 @@ pub(crate) fn create_api() -> walproposer_api {
        conn_async_write: Some(conn_async_write),
        conn_blocking_write: Some(conn_blocking_write),
        recovery_download: Some(recovery_download),
-        wal_reader_allocate: Some(wal_reader_allocate),
        wal_read: Some(wal_read),
-        wal_reader_events: Some(wal_reader_events),
+        wal_reader_allocate: Some(wal_reader_allocate),
+        free_event_set: Some(free_event_set),
        init_event_set: Some(init_event_set),
        update_event_set: Some(update_event_set),
-        active_state_update_event_set: Some(active_state_update_event_set),
        add_safekeeper_event_set: Some(add_safekeeper_event_set),
-        rm_safekeeper_event_set: Some(rm_safekeeper_event_set),
        wait_event_set: Some(wait_event_set),
        strong_random: Some(strong_random),
        get_redo_start_lsn: Some(get_redo_start_lsn),
        finish_sync_safekeepers: Some(finish_sync_safekeepers),
        process_safekeeper_feedback: Some(process_safekeeper_feedback),
+        confirm_wal_streamed: Some(confirm_wal_streamed),
        log_internal: Some(log_internal),
+        after_election: Some(after_election),
    }
 }

--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -6,8 +6,8 @@ use utils::id::TenantTimelineId;
 use crate::{
    api_bindings::{create_api, take_vec_u8, Level},
    bindings::{
-        NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate,
-        WalProposerFree, WalProposerStart,
+        Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
+        WalProposerStart,
    },
 };

@@ -86,19 +86,19 @@ pub trait ApiImpl {
        todo!()
    }

-    fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool {
+    fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
        todo!()
    }

-    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult {
+    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
        todo!()
    }

-    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult {
+    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
        todo!()
    }

-    fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 {
+    fn free_event_set(&self, _wp: &mut WalProposer) {
        todo!()
    }

@@ -110,18 +110,10 @@ pub trait ApiImpl {
        todo!()
    }

-    fn active_state_update_event_set(&self, _sk: &mut Safekeeper) {
-        todo!()
-    }
-
    fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
        todo!()
    }

-    fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) {
-        todo!()
-    }
-
    fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
        todo!()
    }
@@ -142,6 +134,10 @@ pub trait ApiImpl {
        todo!()
    }

+    fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
+        todo!()
+    }
+
    fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
        todo!()
    }
@@ -244,7 +240,6 @@ impl Drop for Wrapper {

 #[cfg(test)]
 mod tests {
-    use core::panic;
    use std::{
        cell::Cell,
        sync::{atomic::AtomicUsize, mpsc::sync_channel},
@@ -252,7 +247,7 @@ mod tests {

    use utils::id::TenantTimelineId;

-    use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
+    use crate::{api_bindings::Level, walproposer::Wrapper};

    use super::ApiImpl;

@@ -360,17 +355,12 @@ mod tests {
            true
        }

-        fn recovery_download(
-            &self,
-            _wp: &mut crate::bindings::WalProposer,
-            _sk: &mut crate::bindings::Safekeeper,
-        ) -> bool {
-            true
+        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("wal_reader_allocate")
        }

-        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult {
-            println!("wal_reader_allocate");
-            crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
+        fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("free_event_set")
        }

        fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
@@ -393,13 +383,6 @@ mod tests {
            self.wait_events.set(WaitEventsData { sk, event_mask });
        }

-        fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) {
-            println!(
-                "rm_safekeeper_event_set, sk={:?}",
-                sk as *mut crate::bindings::Safekeeper
-            );
-        }
-
        fn wait_event_set(
            &self,
            _: &mut crate::bindings::WalProposer,
@@ -425,7 +408,7 @@ mod tests {
        }

        fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
-            println!("wp_log[{}] {}", level, msg);
+            println!("walprop_log[{}] {}", level, msg);
        }

        fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -13,7 +13,6 @@ use bytes::{Buf, Bytes};
 use pageserver::{
    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
-use pageserver_api::shard::TenantShardId;
 use utils::{id::TenantId, lsn::Lsn};

 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -27,9 +26,9 @@ fn redo_scenarios(c: &mut Criterion) {

    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    let conf = Box::leak(Box::new(conf));
-    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+    let tenant_id = TenantId::generate();

-    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+    let manager = PostgresRedoManager::new(conf, tenant_id);

    let manager = Arc::new(manager);

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,12 +1,10 @@
-use pageserver_api::{models::*, shard::TenantShardId};
+use pageserver_api::models::*;
 use reqwest::{IntoUrl, Method};
 use utils::{
    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
 };

-pub mod util;
-
 #[derive(Debug)]
 pub struct Client {
    mgmt_api_endpoint: String,
@@ -164,18 +162,6 @@ impl Client {
        Ok(())
    }

-    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{}/secondary/download",
-            self.mgmt_api_endpoint, tenant_id
-        );
-        self.request(Method::POST, &uri, ())
-            .await?
-            .error_for_status()
-            .map(|_| ())
-            .map_err(|e| Error::ApiError(format!("{}", e)))
-    }
-
    pub async fn location_config(
        &self,
        tenant_id: TenantId,
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -115,8 +115,15 @@ impl PagestreamClient {

    pub async fn getpage(
        &mut self,
-        req: PagestreamGetPageRequest,
+        key: RelTagBlockNo,
+        lsn: Lsn,
    ) -> anyhow::Result<PagestreamGetPageResponse> {
+        let req = PagestreamGetPageRequest {
+            latest: false,
+            rel: key.rel_tag,
+            blkno: key.block_no,
+            lsn,
+        };
        let req = PagestreamFeMessage::GetPage(req);
        let req: bytes::Bytes = req.serialize();
        // let mut req = tokio_util::io::ReaderStream::new(&req);
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -1,7 +1,6 @@
 use anyhow::Context;
 use pageserver_client::page_service::BasebackupRequest;

-use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;

 use rand::prelude::*;
@@ -16,6 +15,8 @@ use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};
 use std::time::Instant;

+use crate::cli;
+use crate::util::tenant_timeline_id::TenantTimelineId;
 use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
 use crate::util::{request_stats, tokio_thread_local_stats};

@@ -80,9 +81,9 @@ async fn main_impl(
    ));

    // discover targets
-    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+    let timelines: Vec<TenantTimelineId> = cli::targets::discover(
        &mgmt_api_client,
-        crate::util::cli::targets::Spec {
+        cli::targets::Spec {
            limit_to_first_n_targets: args.limit_to_first_n_targets,
            targets: args.targets.clone(),
        },
--- a/pageserver/pagebench/src/cli.rs
+++ b/pageserver/pagebench/src/cli.rs
@@ -0,0 +1 @@
+pub(crate) mod targets;
--- a/pageserver/pagebench/src/util/cli/targets.rs
+++ b/pageserver/pagebench/src/util/cli/targets.rs
@@ -2,7 +2,10 @@ use std::sync::Arc;

 use pageserver_client::mgmt_api;
 use tracing::info;
-use utils::id::TenantTimelineId;
+
+use crate::util::{
+    discover_timelines::get_pageserver_tenant_timelines, tenant_timeline_id::TenantTimelineId,
+};

 pub(crate) struct Spec {
    pub(crate) limit_to_first_n_targets: Option<usize>,
@@ -16,7 +19,7 @@ pub(crate) async fn discover(
    let mut timelines = if let Some(targets) = spec.targets {
        targets
    } else {
-        mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await?
+        get_pageserver_tenant_timelines(api_client).await?
    };

    if let Some(limit) = spec.limit_to_first_n_targets {
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -3,9 +3,8 @@ use futures::future::join_all;
 use pageserver::pgdatadir_mapping::key_to_rel_block;
 use pageserver::repository;
 use pageserver_api::key::is_rel_block_key;
-use pageserver_api::models::PagestreamGetPageRequest;
+use pageserver_client::page_service::RelTagBlockNo;

-use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;

 use rand::prelude::*;
@@ -21,6 +20,9 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};

+use crate::cli;
+
+use crate::util::tenant_timeline_id::TenantTimelineId;
 use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
 use crate::util::{request_stats, tokio_thread_local_stats};

@@ -39,9 +41,6 @@ pub(crate) struct Args {
    runtime: Option<humantime::Duration>,
    #[clap(long)]
    per_target_rate_limit: Option<usize>,
-    /// Probability for sending `latest=true` in the request (uniform distribution).
-    #[clap(long, default_value = "1")]
-    req_latest_probability: f64,
    #[clap(long)]
    limit_to_first_n_targets: Option<usize>,
    targets: Option<Vec<TenantTimelineId>>,
@@ -97,9 +96,9 @@ async fn main_impl(
    ));

    // discover targets
-    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+    let timelines: Vec<TenantTimelineId> = cli::targets::discover(
        &mgmt_api_client,
-        crate::util::cli::targets::Spec {
+        cli::targets::Spec {
            limit_to_first_n_targets: args.limit_to_first_n_targets,
            targets: args.targets.clone(),
        },
@@ -203,26 +202,22 @@ async fn main_impl(
            start_work_barrier.wait().await;

            loop {
-                let (timeline, req) = {
+                let (range, key) = {
                    let mut rng = rand::thread_rng();
                    let r = &all_ranges[weights.sample(&mut rng)];
                    let key: i128 = rng.gen_range(r.start..r.end);
                    let key = repository::Key::from_i128(key);
+                    if key.field6 == 0xffffffff {
+                        // Hack around bug
+                        continue;
+                    }
                    let (rel_tag, block_no) =
                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
-                    (
-                        r.timeline,
-                        PagestreamGetPageRequest {
-                            latest: rng.gen_bool(args.req_latest_probability),
-                            lsn: r.timeline_lsn,
-                            rel: rel_tag,
-                            blkno: block_no,
-                        },
-                    )
+                    (r, RelTagBlockNo { rel_tag, block_no })
                };
-                let sender = work_senders.get(&timeline).unwrap();
+                let sender = work_senders.get(&range.timeline).unwrap();
                // TODO: what if this blocks?
-                sender.send(req).await.ok().unwrap();
+                sender.send((key, range.timeline_lsn)).await.ok().unwrap();
            }
        }),
        Some(rps_limit) => Box::pin(async move {
@@ -251,21 +246,16 @@ async fn main_impl(
                    );
                    loop {
                        ticker.tick().await;
-                        let req = {
+                        let (range, key) = {
                            let mut rng = rand::thread_rng();
                            let r = &ranges[weights.sample(&mut rng)];
                            let key: i128 = rng.gen_range(r.start..r.end);
                            let key = repository::Key::from_i128(key);
                            let (rel_tag, block_no) = key_to_rel_block(key)
                                .expect("we filter non-rel-block keys out above");
-                            PagestreamGetPageRequest {
-                                latest: rng.gen_bool(args.req_latest_probability),
-                                lsn: r.timeline_lsn,
-                                rel: rel_tag,
-                                blkno: block_no,
-                            }
+                            (r, RelTagBlockNo { rel_tag, block_no })
                        };
-                        sender.send(req).await.ok().unwrap();
+                        sender.send((key, range.timeline_lsn)).await.ok().unwrap();
                    }
                })
            };
@@ -319,7 +309,7 @@ async fn client(
    args: &'static Args,
    timeline: TenantTimelineId,
    start_work_barrier: Arc<Barrier>,
-    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
+    mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
    all_work_done_barrier: Arc<Barrier>,
    live_stats: Arc<LiveStats>,
 ) {
@@ -333,10 +323,10 @@ async fn client(
        .await
        .unwrap();

-    while let Some(req) = work.recv().await {
+    while let Some((key, lsn)) = work.recv().await {
        let start = Instant::now();
        client
-            .getpage(req)
+            .getpage(key, lsn)
            .await
            .with_context(|| format!("getpage for {timeline}"))
            .unwrap();
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -1,31 +1,19 @@
 use clap::Parser;
 use utils::logging;

-/// Re-usable pieces of code that aren't CLI-specific.
-mod util {
-    pub(crate) mod connstring;
-    pub(crate) mod request_stats;
-    #[macro_use]
-    pub(crate) mod tokio_thread_local_stats;
-    /// Re-usable pieces of CLI-specific code.
-    pub(crate) mod cli {
-        pub(crate) mod targets;
-    }
-}
+pub(crate) mod cli;
+pub(crate) mod util;

-/// The pagebench CLI sub-commands, dispatched in [`main`] below.
-mod cmd {
-    pub(super) mod basebackup;
-    pub(super) mod getpage_latest_lsn;
-    pub(super) mod trigger_initial_size_calculation;
-}
+mod basebackup;
+mod getpage_latest_lsn;
+mod trigger_initial_size_calculation;

 /// Component-level performance test for pageserver.
 #[derive(clap::Parser)]
 enum Args {
-    Basebackup(cmd::basebackup::Args),
-    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
-    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
+    Basebackup(basebackup::Args),
+    GetPageLatestLsn(getpage_latest_lsn::Args),
+    TriggerInitialSizeCalculation(trigger_initial_size_calculation::Args),
 }

 fn main() {
@@ -38,11 +26,9 @@ fn main() {

    let args = Args::parse();
    match args {
-        Args::Basebackup(args) => cmd::basebackup::main(args),
-        Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
-        Args::TriggerInitialSizeCalculation(args) => {
-            cmd::trigger_initial_size_calculation::main(args)
-        }
+        Args::Basebackup(args) => basebackup::main(args),
+        Args::GetPageLatestLsn(args) => getpage_latest_lsn::main(args),
+        Args::TriggerInitialSizeCalculation(args) => trigger_initial_size_calculation::main(args),
    }
    .unwrap()
 }
--- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
+++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
@@ -2,7 +2,8 @@ use std::sync::Arc;

 use humantime::Duration;
 use tokio::task::JoinSet;
-use utils::id::TenantTimelineId;
+
+use crate::{cli, util::tenant_timeline_id::TenantTimelineId};

 #[derive(clap::Parser)]
 pub(crate) struct Args {
@@ -41,9 +42,9 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    ));

    // discover targets
-    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+    let timelines: Vec<TenantTimelineId> = cli::targets::discover(
        &mgmt_api_client,
-        crate::util::cli::targets::Spec {
+        cli::targets::Spec {
            limit_to_first_n_targets: args.limit_to_first_n_targets,
            targets: args.targets.clone(),
        },
--- a/pageserver/pagebench/src/util.rs
+++ b/pageserver/pagebench/src/util.rs
@@ -0,0 +1,6 @@
+pub(crate) mod connstring;
+pub(crate) mod discover_timelines;
+pub(crate) mod request_stats;
+pub(crate) mod tenant_timeline_id;
+#[macro_use]
+pub(crate) mod tokio_thread_local_stats;
--- a/pageserver/pagebench/src/util/discover_timelines.rs
+++ b/pageserver/pagebench/src/util/discover_timelines.rs
@@ -1,17 +1,13 @@
-//! Helpers to do common higher-level tasks with the [`Client`].
-
 use std::sync::Arc;

+use pageserver_client::mgmt_api;
 use tokio::task::JoinSet;
-use utils::id::{TenantId, TenantTimelineId};
+use utils::id::TenantId;

-use super::Client;
+use super::tenant_timeline_id::TenantTimelineId;

-/// Retrieve a list of all of the pageserver's timelines.
-///
-/// Fails if there are sharded tenants present on the pageserver.
-pub async fn get_pageserver_tenant_timelines_unsharded(
-    api_client: &Arc<Client>,
+pub(crate) async fn get_pageserver_tenant_timelines(
+    api_client: &Arc<mgmt_api::Client>,
 ) -> anyhow::Result<Vec<TenantTimelineId>> {
    let mut timelines: Vec<TenantTimelineId> = Vec::new();
    let mut tenants: Vec<TenantId> = Vec::new();
--- a/pageserver/pagebench/src/util/tenant_timeline_id.rs
+++ b/pageserver/pagebench/src/util/tenant_timeline_id.rs
@@ -0,0 +1,55 @@
+use std::str::FromStr;
+
+use anyhow::Context;
+use utils::id::{TenantId, TimelineId};
+
+/// A tenant id paired with a timeline id.
+///
+/// The textual form, used by both [`FromStr`] and [`std::fmt::Display`], is
+/// `<tenant_id>/<timeline_id>`.
+///
+/// The derived ordering compares `tenant_id` first and `timeline_id` second, so
+/// the field order below is significant.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, PartialOrd, Ord)]
+pub(crate) struct TenantTimelineId {
+    pub(crate) tenant_id: TenantId,
+    pub(crate) timeline_id: TimelineId,
+}
+
+impl FromStr for TenantTimelineId {
+    type Err = anyhow::Error;
+
+    /// Parses `<tenant_id>/<timeline_id>`.
+    ///
+    /// The input is split at its first `/`; the part before it is parsed as a
+    /// [`TenantId`] and the remainder as a [`TimelineId`].
+    ///
+    /// # Errors
+    ///
+    /// Fails if `s` contains no `/`, or if either part is rejected by the
+    /// corresponding id parser; the tenant part is checked first.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let (tenant_id, timeline_id) = s
+            .split_once('/')
+            .context("tenant and timeline id must be separated by `/`")?;
+        // Struct fields are evaluated in the order written, so a bad tenant id is
+        // reported before a bad timeline id.
+        Ok(Self {
+            tenant_id: TenantId::from_str(tenant_id)
+                .with_context(|| format!("invalid tenant id: {tenant_id:?}"))?,
+            timeline_id: TimelineId::from_str(timeline_id)
+                .with_context(|| format!("invalid timeline id: {timeline_id:?}"))?,
+        })
+    }
+}
+
+impl std::fmt::Display for TenantTimelineId {
+    /// Writes `<tenant_id>/<timeline_id>`, formatting each id with its own `Display` impl.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self {
+            tenant_id,
+            timeline_id,
+        } = self;
+        write!(f, "{}/{}", tenant_id, timeline_id)
+    }
+}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -23,7 +23,6 @@ use tracing::*;
 use tokio_tar::{Builder, EntryType, Header};

 use crate::context::RequestContext;
-use crate::pgdatadir_mapping::Version;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};

@@ -175,7 +174,7 @@ where
        ] {
            for segno in self
                .timeline
-                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
+                .list_slru_segments(kind, self.lsn, self.ctx)
                .await?
            {
                self.add_slru_segment(kind, segno).await?;
@@ -193,7 +192,7 @@ where
            // Otherwise only include init forks of unlogged relations.
            let rels = self
                .timeline
-                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .list_rels(spcnode, dbnode, self.lsn, self.ctx)
                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
@@ -268,7 +267,7 @@ where
    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
-            .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
+            .get_rel_size(src, self.lsn, false, self.ctx)
            .await?;

        // If the relation is empty, create an empty file
@@ -289,7 +288,7 @@ where
            for blknum in startblk..endblk {
                let img = self
                    .timeline
-                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
+                    .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
                    .await?;
                segment_data.extend_from_slice(&img[..]);
            }
@@ -311,7 +310,7 @@ where
    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
-            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
+            .get_slru_segment_size(slru, segno, self.lsn, self.ctx)
            .await?;

        let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
@@ -353,7 +352,7 @@ where
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
-                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
                .await?;

            ensure!(
@@ -400,7 +399,7 @@ where
            if !has_relmap_file
                && self
                    .timeline
-                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                    .list_rels(spcnode, dbnode, self.lsn, self.ctx)
                    .await?
                    .is_empty()
            {
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -31,7 +31,6 @@ use pageserver::{
    virtual_file,
 };
 use postgres_backend::AuthType;
-use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
@@ -127,7 +126,7 @@ fn main() -> anyhow::Result<()> {
    }

    // Initialize up failpoints support
-    let scenario = failpoint_support::init();
+    let scenario = pageserver::failpoint_support::init();

    // Basic initialization of things that don't change after startup
    virtual_file::init(conf.max_file_descriptors);
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -37,8 +37,8 @@ use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
-    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
+    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -75,9 +75,6 @@ pub mod defaults {
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
-    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

    ///
    /// Default built-in configuration file.
@@ -91,7 +88,6 @@ pub mod defaults {
 #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
 #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'

-#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
 #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}

 # initial superuser role name to use when creating a new tenant
@@ -112,8 +108,6 @@ pub mod defaults {

 #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'

-#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
-
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -131,7 +125,6 @@ pub mod defaults {
 #gc_feedback = false

 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
-#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}

 [remote_storage]

@@ -240,13 +233,6 @@ pub struct PageServerConf {
    /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
    /// heatmap uploads vs. other remote storage operations.
    pub heatmap_upload_concurrency: usize,
-
-    /// How many remote storage downloads may be done for secondary tenants concurrently.  Implicitly
-    /// deprioritises secondary downloads vs. remote storage operations for attached tenants.
-    pub secondary_download_concurrency: usize,
-
-    /// Maximum number of WAL records to be ingested and committed at the same time
-    pub ingest_batch_size: u64,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -328,9 +314,6 @@ struct PageServerConfigBuilder {
    control_plane_emergency_mode: BuilderValue<bool>,

    heatmap_upload_concurrency: BuilderValue<usize>,
-    secondary_download_concurrency: BuilderValue<usize>,
-
-    ingest_batch_size: BuilderValue<u64>,
 }

 impl Default for PageServerConfigBuilder {
@@ -403,9 +386,6 @@ impl Default for PageServerConfigBuilder {
            control_plane_emergency_mode: Set(false),

            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
-            secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
-
-            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
        }
    }
 }
@@ -554,14 +534,6 @@ impl PageServerConfigBuilder {
        self.heatmap_upload_concurrency = BuilderValue::Set(value)
    }

-    pub fn secondary_download_concurrency(&mut self, value: usize) {
-        self.secondary_download_concurrency = BuilderValue::Set(value)
-    }
-
-    pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
-        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_warmup = self
            .concurrent_tenant_warmup
@@ -660,15 +632,10 @@ impl PageServerConfigBuilder {
            control_plane_emergency_mode: self
                .control_plane_emergency_mode
                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
+
            heatmap_upload_concurrency: self
                .heatmap_upload_concurrency
                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
-            secondary_download_concurrency: self
-                .secondary_download_concurrency
-                .ok_or(anyhow!("missing secondary_download_concurrency"))?,
-            ingest_batch_size: self
-                .ingest_batch_size
-                .ok_or(anyhow!("missing ingest_batch_size"))?,
        })
    }
 }
@@ -726,11 +693,6 @@ impl PageServerConf {
            .join(TENANT_LOCATION_CONFIG_NAME)
    }

-    pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
-        self.tenant_path(tenant_shard_id)
-            .join(TENANT_HEATMAP_BASENAME)
-    }
-
    pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
        self.tenant_path(tenant_shard_id)
            .join(TIMELINES_SEGMENT_NAME)
@@ -916,10 +878,6 @@ impl PageServerConf {
                "heatmap_upload_concurrency" => {
                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                },
-                "secondary_download_concurrency" => {
-                    builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
-                },
-                "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -991,8 +949,6 @@ impl PageServerConf {
            control_plane_api_token: None,
            control_plane_emergency_mode: false,
            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
        }
    }
 }
@@ -1221,9 +1177,7 @@ background_task_maximum_delay = '334 s'
                control_plane_api: None,
                control_plane_api_token: None,
                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1284,9 +1238,7 @@ background_task_maximum_delay = '334 s'
                control_plane_api: None,
                control_plane_api_token: None,
                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-                ingest_batch_size: 100,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1516,7 +1468,6 @@ threshold = "20m"
                period: Duration::from_secs(10),
                #[cfg(feature = "testing")]
                mock_statvfs: None,
-                eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
            })
        );
        match &conf.default_tenant_conf.eviction_policy {
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -74,45 +74,6 @@ pub struct DiskUsageEvictionTaskConfig {
    pub period: Duration,
    #[cfg(feature = "testing")]
    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
-    /// Select sorting for evicted layers
-    #[serde(default)]
-    pub eviction_order: EvictionOrder,
-}
-
-/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
-/// partitioning.
-#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "type", content = "args")]
-pub enum EvictionOrder {
-    /// Order the layers to be evicted by how recently they have been accessed in absolute
-    /// time.
-    ///
-    /// This strategy is unfair when some tenants grow faster than others towards the slower
-    /// growing.
-    #[default]
-    AbsoluteAccessed,
-
-    /// Order the layers to be evicted by how recently they have been accessed relatively within
-    /// the set of resident layers of a tenant.
-    ///
-    /// This strategy will evict layers more fairly but is untested.
-    RelativeAccessed {
-        #[serde(default)]
-        highest_layer_count_loses_first: bool,
-    },
-}
-
-impl EvictionOrder {
-    /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
-    /// counts should be the first ones to have their layers evicted.
-    fn highest_layer_count_loses_first(&self) -> bool {
-        match self {
-            EvictionOrder::AbsoluteAccessed => false,
-            EvictionOrder::RelativeAccessed {
-                highest_layer_count_loses_first,
-            } => *highest_layer_count_loses_first,
-        }
-    }
 }

 #[derive(Default)]
@@ -231,14 +192,7 @@ async fn disk_usage_eviction_task_iteration(
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(
-        state,
-        storage,
-        usage_pre,
-        task_config.eviction_order,
-        cancel,
-    )
-    .await;
+    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -324,7 +278,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
    _storage: &GenericRemoteStorage,
    usage_pre: U,
-    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
    // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -344,7 +297,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        "running disk usage based eviction due to pressure"
    );

-    let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
+    let candidates = match collect_eviction_candidates(cancel).await? {
        EvictionCandidates::Cancelled => {
            return Ok(IterationOutcome::Cancelled);
        }
@@ -354,16 +307,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // Debug-log the list of candidates
    let now = SystemTime::now();
    for (i, (partition, candidate)) in candidates.iter().enumerate() {
-        let nth = i + 1;
        let desc = candidate.layer.layer_desc();
-        let total_candidates = candidates.len();
-        let size = desc.file_size;
-        let rel = candidate.relative_last_activity;
        debug!(
-            "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
+            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
+            i + 1,
+            candidates.len(),
+            desc.file_size,
            now.duration_since(candidate.last_activity_ts)
                .unwrap()
                .as_micros(),
+            partition,
            desc.tenant_shard_id,
            desc.timeline_id,
            candidate.layer,
@@ -506,7 +459,6 @@ struct EvictionCandidate {
    timeline: Arc<Timeline>,
    layer: Layer,
    last_activity_ts: SystemTime,
-    relative_last_activity: finite_f32::FiniteF32,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
@@ -526,24 +478,24 @@ enum EvictionCandidates {
 /// order. A caller that evicts in that order, until pressure is relieved, implements
 /// the eviction policy outlined in the module comment.
 ///
-/// # Example with EvictionOrder::AbsoluteAccessed
+/// # Example
 ///
 /// Imagine that there are two tenants, A and B, with five layers each, a-e.
 /// Each layer has size 100, and both tenant's min_resident_size is 150.
 /// The eviction order would be
 ///
 /// ```text
-/// partition last_activity_ts tenant/layer
-/// Above     18:30            A/c
-/// Above     19:00            A/b
-/// Above     18:29            B/c
-/// Above     19:05            B/b
-/// Above     20:00            B/a
-/// Above     20:03            A/a
-/// Below     20:30            A/d
-/// Below     20:40            B/d
-/// Below     20:45            B/e
-/// Below     20:58            A/e
+/// partition last_activity_ts    tenant/layer
+/// Above     18:30               A/c
+/// Above     19:00               A/b
+/// Above     18:29               B/c
+/// Above     19:05               B/b
+/// Above     20:00               B/a
+/// Above     20:03               A/a
+/// Below     20:30               A/d
+/// Below     20:40               B/d
+/// Below     20:45               B/e
+/// Below     20:58               A/e
 /// ```
 ///
 /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
@@ -553,77 +505,7 @@ enum EvictionCandidates {
 /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
 /// after exhauting the `Above` partition.
 /// So, we did not respect each tenant's min_resident_size.
-///
-/// # Example with EvictionOrder::RelativeAccessed
-///
-/// ```text
-/// partition relative_age last_activity_ts tenant/layer
-/// Above     0/4          18:30            A/c
-/// Above     0/4          18:29            B/c
-/// Above     1/4          19:00            A/b
-/// Above     1/4          19:05            B/b
-/// Above     2/4          20:00            B/a
-/// Above     2/4          20:03            A/a
-/// Below     3/4          20:30            A/d
-/// Below     3/4          20:40            B/d
-/// Below     4/4          20:45            B/e
-/// Below     4/4          20:58            A/e
-/// ```
-///
-/// With tenants having the same number of layers the picture does not change much. The same with
-/// A having many more layers **resident** (not all of them listed):
-///
-/// ```text
-/// Above       0/100      18:30            A/c
-/// Above       0/4        18:29            B/c
-/// Above       1/100      19:00            A/b
-/// Above       2/100      20:03            A/a
-/// Above       3/100      20:03            A/nth_3
-/// Above       4/100      20:03            A/nth_4
-///             ...
-/// Above       1/4        19:05            B/b
-/// Above      25/100      20:04            A/nth_25
-///             ...
-/// Above       2/4        20:00            B/a
-/// Above      50/100      20:10            A/nth_50
-///             ...
-/// Below       3/4        20:40            B/d
-/// Below      99/100      20:30            A/nth_99
-/// Below       4/4        20:45            B/e
-/// Below     100/100      20:58            A/nth_100
-/// ```
-///
-/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
-/// difficult to see is what happens on the next round assuming the evicting 23 from the above list
-/// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has
-/// appeared:
-///
-/// ```text
-/// Above       0/87       20:04            A/nth_23
-/// Above       0/3        19:05            B/b
-/// Above       0/50       20:59            C/nth_0
-/// Above       1/87       20:04            A/nth_24
-/// Above       1/50       21:00            C/nth_1
-/// Above       2/87       20:04            A/nth_25
-///             ...
-/// Above      16/50       21:02            C/nth_16
-/// Above       1/3        20:00            B/a
-/// Above      27/87       20:10            A/nth_50
-///             ...
-/// Below       2/3        20:40            B/d
-/// Below      49/50       21:05            C/nth_49
-/// Below      86/87       20:30            A/nth_99
-/// Below       3/3        20:45            B/e
-/// Below      50/50       21:05            C/nth_50
-/// Below      87/87       20:58            A/nth_100
-/// ```
-///
-/// Now relieving pressure with 23 layers would cost:
-/// - tenant A 14 layers
-/// - tenant B 1 layer
-/// - tenant C 8 layers
 async fn collect_eviction_candidates(
-    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
    // get a snapshot of the list of tenants
@@ -709,63 +591,12 @@ async fn collect_eviction_candidates(
        tenant_candidates
            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
        let mut cumsum: i128 = 0;
-
-        // keeping the -1 or not decides if every tenant should lose their least recently accessed
-        // layer OR if this should happen in the order of having highest layer count:
-        let fudge = if eviction_order.highest_layer_count_loses_first() {
-            // relative_age vs. tenant layer count:
-            // - 0.1..=1.0 (10 layers)
-            // - 0.01..=1.0 (100 layers)
-            // - 0.001..=1.0 (1000 layers)
-            //
-            // leading to evicting less of the smallest tenants.
-            0
-        } else {
-            // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
-            // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
-            // be that less than 10k layer evictions is enough, so we would not need to evict from
-            // all tenants.
-            //
-            // as the tenant ordering is now deterministic this could hit the same tenants
-            // disproportionetly on multiple invocations. alternative could be to remember how many
-            // layers did we evict last time from this tenant, and inject that as an additional
-            // fudge here.
-            1
-        };
-
-        let total = tenant_candidates
-            .len()
-            .checked_sub(fudge)
-            .filter(|&x| x > 0)
-            // support 0 or 1 resident layer tenants as well
-            .unwrap_or(1);
-        let divider = total as f32;
-
-        for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
+        for (timeline, layer_info) in tenant_candidates.into_iter() {
            let file_size = layer_info.file_size();
-
-            // as we iterate this reverse sorted list, the most recently accessed layer will always
-            // be 1.0; this is for us to evict it last.
-            let relative_last_activity = if matches!(
-                eviction_order,
-                EvictionOrder::RelativeAccessed { .. }
-            ) {
-                // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
-                // similarly for u16. unsure how it would help.
-                finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
-                    .unwrap_or_else(|val| {
-                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
-                        finite_f32::FiniteF32::ZERO
-                    })
-            } else {
-                finite_f32::FiniteF32::ZERO
-            };
-
            let candidate = EvictionCandidate {
                timeline,
                last_activity_ts: layer_info.last_activity_ts,
                layer: layer_info.layer,
-                relative_last_activity,
            };
            let partition = if cumsum > min_resident_size as i128 {
                MinResidentSizePartition::Above
@@ -779,19 +610,8 @@ async fn collect_eviction_candidates(

    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
-
-    match eviction_order {
-        EvictionOrder::AbsoluteAccessed => {
-            candidates.sort_unstable_by_key(|(partition, candidate)| {
-                (*partition, candidate.last_activity_ts)
-            });
-        }
-        EvictionOrder::RelativeAccessed { .. } => {
-            candidates.sort_unstable_by_key(|(partition, candidate)| {
-                (*partition, candidate.relative_last_activity)
-            });
-        }
-    }
+    candidates
+        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));

    Ok(EvictionCandidates::Finished(candidates))
 }
@@ -820,66 +640,6 @@ impl std::ops::Deref for TimelineKey {
    }
 }

-/// A totally ordered f32 subset we can use with sorting functions.
-mod finite_f32 {
-
-    /// A totally ordered f32 subset we can use with sorting functions.
-    #[derive(Clone, Copy, PartialEq)]
-    pub struct FiniteF32(f32);
-
-    impl std::fmt::Debug for FiniteF32 {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            std::fmt::Debug::fmt(&self.0, f)
-        }
-    }
-
-    impl std::fmt::Display for FiniteF32 {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            std::fmt::Display::fmt(&self.0, f)
-        }
-    }
-
-    impl std::cmp::Eq for FiniteF32 {}
-
-    impl std::cmp::PartialOrd for FiniteF32 {
-        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-            Some(self.cmp(other))
-        }
-    }
-
-    impl std::cmp::Ord for FiniteF32 {
-        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-            self.0.total_cmp(&other.0)
-        }
-    }
-
-    impl TryFrom<f32> for FiniteF32 {
-        type Error = f32;
-
-        fn try_from(value: f32) -> Result<Self, Self::Error> {
-            if value.is_finite() {
-                Ok(FiniteF32(value))
-            } else {
-                Err(value)
-            }
-        }
-    }
-
-    impl FiniteF32 {
-        pub const ZERO: FiniteF32 = FiniteF32(0.0);
-
-        pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
-            if (0.0..=1.0).contains(&value) {
-                // -0.0 is within the range, make sure it is assumed 0.0..=1.0
-                let value = value.abs();
-                Ok(FiniteF32(value))
-            } else {
-                Err(value)
-            }
-        }
-    }
-}
-
 mod filesystem_level_usage {
    use anyhow::Context;
    use camino::Utf8Path;
@@ -961,7 +721,6 @@ mod filesystem_level_usage {

    #[test]
    fn max_usage_pct_pressure() {
-        use super::EvictionOrder;
        use super::Usage as _;
        use std::time::Duration;
        use utils::serde_percent::Percent;
@@ -973,7 +732,6 @@ mod filesystem_level_usage {
                period: Duration::MAX,
                #[cfg(feature = "testing")]
                mock_statvfs: None,
-                eviction_order: EvictionOrder::default(),
            },
            total_bytes: 100_000,
            avail_bytes: 0,
--- a/pageserver/src/failpoint_support.rs
+++ b/pageserver/src/failpoint_support.rs
@@ -1,14 +1,3 @@
-//! Failpoint support code shared between pageserver and safekeepers.
-
-use crate::http::{
-    error::ApiError,
-    json::{json_request, json_response},
-};
-use hyper::{Body, Request, Response, StatusCode};
-use serde::{Deserialize, Serialize};
-use tokio_util::sync::CancellationToken;
-use tracing::*;
-
 /// use with fail::cfg("$name", "return(2000)")
 ///
 /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
@@ -36,7 +25,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async;
 // Helper function used by the macro. (A function has nicer scoping so we
 // don't need to decorate everything with "::")
 #[doc(hidden)]
-pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
    let millis = duration_str.parse::<u64>().unwrap();
    let d = std::time::Duration::from_millis(millis);

@@ -82,7 +71,7 @@ pub fn init() -> fail::FailScenario<'static> {
    scenario
 }

-pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
+pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
    if actions == "exit" {
        fail::cfg_callback(name, exit_failpoint)
    } else {
@@ -95,45 +84,3 @@ fn exit_failpoint() {
    tracing::info!("Exit requested by failpoint");
    std::process::exit(1);
 }
-
-pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
-
-/// Information for configuring a single fail point
-#[derive(Debug, Serialize, Deserialize)]
-pub struct FailpointConfig {
-    /// Name of the fail point
-    pub name: String,
-    /// List of actions to take, using the format described in `fail::cfg`
-    ///
-    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
-    pub actions: String,
-}
-
-/// Configure failpoints through http.
-pub async fn failpoints_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    if !fail::has_failpoints() {
-        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Cannot manage failpoints because storage was compiled without failpoints support"
-        )));
-    }
-
-    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
-    for fp in failpoints {
-        info!("cfg failpoint: {} {}", fp.name, fp.actions);
-
-        // We recognize one extra "action" that's not natively recognized
-        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = apply_failpoint(&fp.name, &fp.actions);
-
-        if let Err(err_msg) = cfg_result {
-            return Err(ApiError::BadRequest(anyhow::anyhow!(
-                "Failed to configure failpoints: {err_msg}"
-            )));
-        }
-    }
-
-    json_response(StatusCode::OK, ())
-}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -159,12 +159,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "412":
-          description: Deletion may not proceed, tenant is not in Active state
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/PreconditionFailedError"
        "500":
          description: Generic operation error
          content:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -25,7 +25,6 @@ use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
-use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -67,6 +66,9 @@ use utils::{
    lsn::Lsn,
 };

+// Imports only used for testing APIs
+use pageserver_api::models::ConfigureFailpointsRequest;
+
 // For APIs that require an Active tenant, how long should we block waiting for that state?
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
 // failed API calls while tenants are activating.
@@ -152,7 +154,6 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::AncestorStopping(_) => {
                ApiError::ResourceUnavailable(format!("{pre}").into())
            }
-            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
    }
@@ -307,7 +308,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
-            Cancelled => ApiError::ShuttingDown,
        }
    }
 }
@@ -593,6 +593,8 @@ async fn get_lsn_by_timestamp_handler(
        )));
    }

+    let version: Option<u8> = parse_query_param(&request, "version")?;
+
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let timestamp_raw = must_get_query_param(&request, "timestamp")?;
    let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -605,18 +607,31 @@ async fn get_lsn_by_timestamp_handler(
    let result = timeline
        .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
        .await?;
-    #[derive(serde::Serialize)]
-    struct Result {
-        lsn: Lsn,
-        kind: &'static str,
+
+    if version.unwrap_or(0) > 1 {
+        #[derive(serde::Serialize)]
+        struct Result {
+            lsn: Lsn,
+            kind: &'static str,
+        }
+        let (lsn, kind) = match result {
+            LsnForTimestamp::Present(lsn) => (lsn, "present"),
+            LsnForTimestamp::Future(lsn) => (lsn, "future"),
+            LsnForTimestamp::Past(lsn) => (lsn, "past"),
+            LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
+        };
+        json_response(StatusCode::OK, Result { lsn, kind })
+    } else {
+        // FIXME: this is a temporary crutch not to break backwards compatibility
+        // See https://github.com/neondatabase/neon/pull/5608
+        let result = match result {
+            LsnForTimestamp::Present(lsn) => format!("{lsn}"),
+            LsnForTimestamp::Future(_lsn) => "future".into(),
+            LsnForTimestamp::Past(_lsn) => "past".into(),
+            LsnForTimestamp::NoData(_lsn) => "nodata".into(),
+        };
+        json_response(StatusCode::OK, result)
    }
-    let (lsn, kind) = match result {
-        LsnForTimestamp::Present(lsn) => (lsn, "present"),
-        LsnForTimestamp::Future(lsn) => (lsn, "future"),
-        LsnForTimestamp::Past(lsn) => (lsn, "past"),
-        LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
-    };
-    json_response(StatusCode::OK, Result { lsn, kind })
 }

 async fn get_timestamp_of_lsn_handler(
@@ -886,9 +901,7 @@ async fn tenant_delete_handler(

    let state = get_state(&request);

-    state
-        .tenant_manager
-        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
        .instrument(info_span!("tenant_delete_handler",
            tenant_id = %tenant_shard_id.tenant_id,
            shard = %tenant_shard_id.shard_slug()
@@ -1274,23 +1287,6 @@ async fn put_tenant_location_config_handler(
        // which is not a 400 but a 409.
        .map_err(ApiError::BadRequest)?;

-    if let Some(_flush_ms) = flush {
-        match state
-            .secondary_controller
-            .upload_tenant(tenant_shard_id)
-            .await
-        {
-            Ok(()) => {
-                tracing::info!("Uploaded heatmap during flush");
-            }
-            Err(e) => {
-                tracing::warn!("Failed to flush heatmap: {e}");
-            }
-        }
-    } else {
-        tracing::info!("No flush requested when configuring");
-    }
-
    json_response(StatusCode::OK, ())
 }

@@ -1309,6 +1305,34 @@ async fn handle_tenant_break(
    json_response(StatusCode::OK, ())
 }

+async fn failpoints_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    if !fail::has_failpoints() {
+        return Err(ApiError::BadRequest(anyhow!(
+            "Cannot manage failpoints because pageserver was compiled without failpoints support"
+        )));
+    }
+
+    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
+    for fp in failpoints {
+        info!("cfg failpoint: {} {}", fp.name, fp.actions);
+
+        // We recognize one extra "action" that's not natively recognized
+        // by the failpoints crate: exit, to immediately kill the process
+        let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
+
+        if let Err(err_msg) = cfg_result {
+            return Err(ApiError::BadRequest(anyhow!(
+                "Failed to configure failpoints: {err_msg}"
+            )));
+        }
+    }
+
+    json_response(StatusCode::OK, ())
+}
+
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
    mut request: Request<Body>,
@@ -1557,22 +1581,19 @@ async fn disk_usage_eviction_run(
    struct Config {
        /// How many bytes to evict before reporting that pressure is relieved.
        evict_bytes: u64,
-
-        #[serde(default)]
-        eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
    }

    #[derive(Debug, Clone, Copy, serde::Serialize)]
    struct Usage {
        // remains unchanged after instantiation of the struct
-        evict_bytes: u64,
+        config: Config,
        // updated by `add_available_bytes`
        freed_bytes: u64,
    }

    impl crate::disk_usage_eviction_task::Usage for Usage {
        fn has_pressure(&self) -> bool {
-            self.evict_bytes > self.freed_bytes
+            self.config.evict_bytes > self.freed_bytes
        }

        fn add_available_bytes(&mut self, bytes: u64) {
@@ -1583,7 +1604,7 @@ async fn disk_usage_eviction_run(
    let config = json_request::<Config>(&mut r).await?;

    let usage = Usage {
-        evict_bytes: config.evict_bytes,
+        config,
        freed_bytes: 0,
    };

@@ -1598,11 +1619,7 @@ async fn disk_usage_eviction_run(
    let state = state.disk_usage_eviction_state.clone();

    let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-        &state,
-        storage,
-        usage,
-        config.eviction_order,
-        &cancel,
+        &state, storage, usage, &cancel,
    )
    .await;

@@ -1628,21 +1645,6 @@ async fn secondary_upload_handler(
    json_response(StatusCode::OK, ())
 }

-async fn secondary_download_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    state
-        .secondary_controller
-        .download_tenant(tenant_shard_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -1911,9 +1913,6 @@ pub fn make_router(
        .put("/v1/deletion_queue/flush", |r| {
            api_handler(r, deletion_queue_flush)
        })
-        .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
-            api_handler(r, secondary_download_handler)
-        })
        .put("/v1/tenant/:tenant_shard_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -21,7 +21,6 @@ use tracing::*;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
-use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
@@ -313,16 +312,13 @@ async fn import_wal(
        waldecoder.feed_bytes(&buf);

        let mut nrecords = 0;
-        let mut modification = tline.begin_modification(last_lsn);
+        let mut modification = tline.begin_modification(endpoint);
        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= endpoint {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                walingest
                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                    .await?;
-                WAL_INGEST.records_committed.inc();
-
-                modification.commit(ctx).await?;
                last_lsn = lsn;

                nrecords += 1;
@@ -452,14 +448,13 @@ pub async fn import_wal_from_tar(

        waldecoder.feed_bytes(&bytes[offset..]);

-        let mut modification = tline.begin_modification(last_lsn);
+        let mut modification = tline.begin_modification(end_lsn);
        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= end_lsn {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                walingest
                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                    .await?;
-                modification.commit(ctx).await?;
                last_lsn = lsn;

                debug!("imported record at {} (end {})", lsn, end_lsn);
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -25,6 +25,8 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+pub mod failpoint_support;
+
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
@@ -117,10 +119,6 @@ pub const TENANT_CONFIG_NAME: &str = "config";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";

-/// Per-tenant copy of their remote heatmap, downloaded into the local
-/// tenant path while in secondary mode.
-pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
-
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 // Metrics collected on operations on the storage repository.
 #[derive(Debug, EnumVariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
-pub(crate) enum StorageTimeOperation {
+pub enum StorageTimeOperation {
    #[strum(serialize = "layer flush")]
    LayerFlush,

@@ -55,7 +55,7 @@ pub(crate) enum StorageTimeOperation {
    CreateTenant,
 }

-pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
+pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
    register_counter_vec!(
        "pageserver_storage_operations_seconds_sum",
        "Total time spent on storage operations with operation, tenant and timeline dimensions",
@@ -64,7 +64,7 @@ pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(||
    .expect("failed to define a metric")
 });

-pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_storage_operations_seconds_count",
        "Count of storage operations with operation, tenant and timeline dimensions",
@@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) struct PageCacheMetricsForTaskKind {
+pub struct PageCacheMetricsForTaskKind {
    pub read_accesses_materialized_page: IntCounter,
    pub read_accesses_immutable: IntCounter,

@@ -159,7 +159,7 @@ pub(crate) struct PageCacheMetricsForTaskKind {
    pub read_hits_materialized_page_older_lsn: IntCounter,
 }

-pub(crate) struct PageCacheMetrics {
+pub struct PageCacheMetrics {
    map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
 }

@@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
+pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
    map: EnumMap::from_array(std::array::from_fn(|task_kind| {
        let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
        let task_kind: &'static str = task_kind.into();
@@ -243,9 +243,10 @@ impl PageCacheMetrics {
    }
 }

-pub(crate) struct PageCacheSizeMetrics {
+pub struct PageCacheSizeMetrics {
    pub max_bytes: UIntGauge,

+    pub current_bytes_ephemeral: UIntGauge,
    pub current_bytes_immutable: UIntGauge,
    pub current_bytes_materialized_page: UIntGauge,
 }
@@ -259,26 +260,31 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
-    Lazy::new(|| PageCacheSizeMetrics {
-        max_bytes: {
-            register_uint_gauge!(
-                "pageserver_page_cache_size_max_bytes",
-                "Maximum size of the page cache in bytes"
-            )
-            .expect("failed to define a metric")
-        },
-        current_bytes_immutable: {
-            PAGE_CACHE_SIZE_CURRENT_BYTES
-                .get_metric_with_label_values(&["immutable"])
-                .unwrap()
-        },
-        current_bytes_materialized_page: {
-            PAGE_CACHE_SIZE_CURRENT_BYTES
-                .get_metric_with_label_values(&["materialized_page"])
-                .unwrap()
-        },
-    });
+pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
+    max_bytes: {
+        register_uint_gauge!(
+            "pageserver_page_cache_size_max_bytes",
+            "Maximum size of the page cache in bytes"
+        )
+        .expect("failed to define a metric")
+    },
+
+    current_bytes_ephemeral: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["ephemeral"])
+            .unwrap()
+    },
+    current_bytes_immutable: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["immutable"])
+            .unwrap()
+    },
+    current_bytes_materialized_page: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["materialized_page"])
+            .unwrap()
+    },
+});

 pub(crate) mod page_cache_eviction_metrics {
    use std::num::NonZeroUsize;
@@ -337,15 +343,6 @@ pub(crate) mod page_cache_eviction_metrics {
    }
 }

-pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_page_cache_acquire_pinned_slot_seconds",
-        "Time spent acquiring a pinned slot in the page cache",
-        CRITICAL_OP_BUCKETS.into(),
-    )
-    .expect("failed to define a metric")
-});
-
 static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "page_cache_errors_total",
@@ -516,18 +513,14 @@ pub(crate) mod initial_logical_size {
    impl StartCalculation {
        pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
            let circumstances_label: &'static str = circumstances.into();
-            self.0
-                .with_label_values(&["first", circumstances_label])
-                .inc();
+            self.0.with_label_values(&["first", circumstances_label]).inc();
            OngoingCalculationGuard {
                inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
            }
        }
        pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
            let circumstances_label: &'static str = circumstances.into();
-            self.0
-                .with_label_values(&["retry", circumstances_label])
-                .inc();
+            self.0.with_label_values(&["retry", circumstances_label]).inc();
            OngoingCalculationGuard {
                inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
            }
@@ -734,13 +727,13 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {

 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
-pub(crate) struct EvictionsWithLowResidenceDuration {
+pub struct EvictionsWithLowResidenceDuration {
    data_source: &'static str,
    threshold: Duration,
    counter: Option<IntCounter>,
 }

-pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
+pub struct EvictionsWithLowResidenceDurationBuilder {
    data_source: &'static str,
    threshold: Duration,
 }
@@ -1003,7 +996,7 @@ pub enum SmgrQueryType {
 }

 #[derive(Debug)]
-pub(crate) struct SmgrQueryTimePerTimeline {
+pub struct SmgrQueryTimePerTimeline {
    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
 }

@@ -1017,62 +1010,12 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
-    [
-        1,
-        10,
-        20,
-        40,
-        60,
-        80,
-        100,
-        200,
-        300,
-        400,
-        500,
-        600,
-        700,
-        800,
-        900,
-        1_000, // 1ms
-        2_000,
-        4_000,
-        6_000,
-        8_000,
-        10_000, // 10ms
-        20_000,
-        40_000,
-        60_000,
-        80_000,
-        100_000,
-        200_000,
-        400_000,
-        600_000,
-        800_000,
-        1_000_000, // 1s
-        2_000_000,
-        4_000_000,
-        6_000_000,
-        8_000_000,
-        10_000_000, // 10s
-        20_000_000,
-        50_000_000,
-        100_000_000,
-        200_000_000,
-        1_000_000_000, // 1000s
-    ]
-    .into_iter()
-    .map(Duration::from_micros)
-    .map(|d| d.as_secs_f64())
-    .collect()
-});
-
 static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds_global",
        "Time spent on smgr query handling, aggregated by query type.",
        &["smgr_query_type"],
-        SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
+        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -1175,8 +1118,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
    .map(|ms| (ms as f64) / 1000.0)
 });

-pub(crate) struct BasebackupQueryTime(HistogramVec);
-pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
+pub struct BasebackupQueryTime(HistogramVec);
+pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
    BasebackupQueryTime({
        register_histogram_vec!(
            "pageserver_basebackup_query_seconds",
@@ -1196,7 +1139,7 @@ impl DurationResultObserver for BasebackupQueryTime {
    }
 }

-pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
+pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_live_connections",
        "Number of live network connections",
@@ -1363,8 +1306,6 @@ pub(crate) struct SecondaryModeMetrics {
    pub(crate) upload_heatmap: IntCounter,
    pub(crate) upload_heatmap_errors: IntCounter,
    pub(crate) upload_heatmap_duration: Histogram,
-    pub(crate) download_heatmap: IntCounter,
-    pub(crate) download_layer: IntCounter,
 }
 pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
    upload_heatmap: register_int_counter!(
@@ -1382,16 +1323,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
        "Time to build and upload a heatmap, including any waiting inside the S3 client"
    )
    .expect("failed to define a metric"),
-    download_heatmap: register_int_counter!(
-        "pageserver_secondary_download_heatmap",
-        "Number of downloads of heatmaps by secondary mode locations"
-    )
-    .expect("failed to define a metric"),
-    download_layer: register_int_counter!(
-        "pageserver_secondary_download_layer",
-        "Number of downloads of layers by secondary mode locations"
-    )
-    .expect("failed to define a metric"),
 });

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1661,7 +1592,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
    Lazy::new(WalRedoProcessCounters::default);

 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
-pub(crate) struct StorageTimeMetricsTimer {
+pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
    start: Instant,
 }
@@ -1686,7 +1617,7 @@ impl StorageTimeMetricsTimer {
 /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
 /// timeline total sum and count.
 #[derive(Clone, Debug)]
-pub(crate) struct StorageTimeMetrics {
+pub struct StorageTimeMetrics {
    /// Sum of f64 seconds, per operation, tenant_id and timeline_id
    timeline_sum: Counter,
    /// Number of oeprations, per operation, tenant_id and timeline_id
@@ -1725,7 +1656,7 @@ impl StorageTimeMetrics {
 }

 #[derive(Debug)]
-pub(crate) struct TimelineMetrics {
+pub struct TimelineMetrics {
    tenant_id: String,
    shard_id: String,
    timeline_id: String,
@@ -1933,7 +1864,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
    }
 }

-pub(crate) struct RemoteTimelineClientMetrics {
+pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
@@ -2231,7 +2162,7 @@ impl Drop for RemoteTimelineClientMetrics {

 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
-pub(crate) trait MeasureRemoteOp: Sized {
+pub trait MeasureRemoteOp: Sized {
    fn measure_remote_op(
        self,
        tenant_id: TenantId,
@@ -2256,7 +2187,7 @@ pub(crate) trait MeasureRemoteOp: Sized {
 impl<T: Sized> MeasureRemoteOp for T {}

 pin_project! {
-    pub(crate) struct MeasuredRemoteOp<F>
+    pub struct MeasuredRemoteOp<F>
    {
        #[pin]
        inner: F,
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -550,7 +550,6 @@ impl PageCache {
    // not require changes.

    async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
-        let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
        match tokio::time::timeout(
            // Choose small timeout, neon_smgr does its own retries.
            // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
@@ -563,7 +562,6 @@ impl PageCache {
                res.expect("this semaphore is never closed"),
            )),
            Err(_timeout) => {
-                timer.stop_and_discard();
                crate::metrics::page_cache_errors_inc(
                    crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
                );
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -25,7 +25,6 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
-use std::borrow::Cow;
 use std::io;
 use std::net::TcpListener;
 use std::pin::pin;
@@ -54,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::{rel_block_to_key, Version};
+use crate::pgdatadir_mapping::rel_block_to_key;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -62,9 +61,6 @@ use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::ShardSelector;
-use crate::tenant::timeline::WaitLsnError;
-use crate::tenant::GetTimelineError;
-use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

@@ -287,64 +283,6 @@ struct PageServerHandler {
    connection_ctx: RequestContext,
 }

-#[derive(thiserror::Error, Debug)]
-enum PageStreamError {
-    /// We encountered an error that should prompt the client to reconnect:
-    /// in practice this means we drop the connection without sending a response.
-    #[error("Reconnect required: {0}")]
-    Reconnect(Cow<'static, str>),
-
-    /// We were instructed to shutdown while processing the query
-    #[error("Shutting down")]
-    Shutdown,
-
-    /// Something went wrong reading a page: this likely indicates a pageserver bug
-    #[error("Read error: {0}")]
-    Read(PageReconstructError),
-
-    /// Ran out of time waiting for an LSN
-    #[error("LSN timeout: {0}")]
-    LsnTimeout(WaitLsnError),
-
-    /// The entity required to serve the request (tenant or timeline) is not found,
-    /// or is not found in a suitable state to serve a request.
-    #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
-
-    /// Request asked for something that doesn't make sense, like an invalid LSN
-    #[error("Bad request: {0}")]
-    BadRequest(std::borrow::Cow<'static, str>),
-}
-
-impl From<PageReconstructError> for PageStreamError {
-    fn from(value: PageReconstructError) -> Self {
-        match value {
-            PageReconstructError::Cancelled => Self::Shutdown,
-            e => Self::Read(e),
-        }
-    }
-}
-
-impl From<GetActiveTimelineError> for PageStreamError {
-    fn from(value: GetActiveTimelineError) -> Self {
-        match value {
-            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
-            GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
-            GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
-        }
-    }
-}
-
-impl From<WaitLsnError> for PageStreamError {
-    fn from(value: WaitLsnError) -> Self {
-        match value {
-            e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
-            WaitLsnError::Shutdown => Self::Shutdown,
-            WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
-        }
-    }
-}
-
 impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
@@ -490,7 +428,7 @@ impl PageServerHandler {
        // Check that the timeline exists
        let timeline = tenant
            .get_timeline(timeline_id, true)
-            .map_err(|e| QueryError::NotFound(format!("{e}").into()))?;
+            .map_err(|e| anyhow::anyhow!(e))?;

        // Avoid starting new requests if the timeline has already started shutting down,
        // and block timeline shutdown until this request is complete, or drops out due
@@ -582,44 +520,32 @@ impl PageServerHandler {
                }
            };

-            match response {
-                Err(PageStreamError::Shutdown) => {
+            if let Err(e) = &response {
+                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
+                // because wait_lsn etc will drop out
+                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
+                // is_cancelled(): [`Timeline::shutdown`] has entered
+                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
                    // If we fail to fulfil a request during shutdown, which may be _because_ of
                    // shutdown, then do not send the error to the client.  Instead just drop the
                    // connection.
-                    span.in_scope(|| info!("dropping connection due to shutdown"));
+                    span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
                    return Err(QueryError::Shutdown);
                }
-                Err(PageStreamError::Reconnect(reason)) => {
-                    span.in_scope(|| info!("handler requested reconnect: {reason}"));
-                    return Err(QueryError::Reconnect);
-                }
-                Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => {
-                    // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
-                    // shutdown error, this may be buried inside a PageReconstructError::Other for example.
-                    //
-                    // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
-                    // because wait_lsn etc will drop out
-                    // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
-                    // is_canceled(): [`Timeline::shutdown`]` has entered
-                    span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
-                    return Err(QueryError::Shutdown);
-                }
-                r => {
-                    let response_msg = r.unwrap_or_else(|e| {
-                        // print the all details to the log with {:#}, but for the client the
-                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                        // here includes cancellation which is not an error.
-                        span.in_scope(|| error!("error reading relation or page version: {:#}", e));
-                        PagestreamBeMessage::Error(PagestreamErrorResponse {
-                            message: e.to_string(),
-                        })
-                    });
-
-                    pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
-                    self.flush_cancellable(pgb, &timeline.cancel).await?;
-                }
            }
+
+            let response = response.unwrap_or_else(|e| {
+                // print all the details to the log with {:#}, but for the client the
+                // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                // here includes cancellation which is not an error.
+                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                PagestreamBeMessage::Error(PagestreamErrorResponse {
+                    message: e.to_string(),
+                })
+            });
+
+            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
+            self.flush_cancellable(pgb, &timeline.cancel).await?;
        }
        Ok(())
    }
@@ -766,7 +692,7 @@ impl PageServerHandler {
        latest: bool,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
        ctx: &RequestContext,
-    ) -> Result<Lsn, PageStreamError> {
+    ) -> anyhow::Result<Lsn> {
        if latest {
            // Latest page version was requested. If LSN is given, it is a hint
            // to the page server that there have been no modifications to the
@@ -797,19 +723,15 @@ impl PageServerHandler {
            }
        } else {
            if lsn == Lsn(0) {
-                return Err(PageStreamError::BadRequest(
-                    "invalid LSN(0) in request".into(),
-                ));
+                anyhow::bail!("invalid LSN(0) in request");
            }
            timeline.wait_lsn(lsn, ctx).await?;
        }
-
-        if lsn < **latest_gc_cutoff_lsn {
-            return Err(PageStreamError::BadRequest(format!(
-                "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
-                lsn, **latest_gc_cutoff_lsn
-            ).into()));
-        }
+        anyhow::ensure!(
+            lsn >= **latest_gc_cutoff_lsn,
+            "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+            lsn, **latest_gc_cutoff_lsn
+        );
        Ok(lsn)
    }

@@ -818,14 +740,14 @@ impl PageServerHandler {
        timeline: &Timeline,
        req: &PagestreamExistsRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

        let exists = timeline
-            .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
+            .get_rel_exists(req.rel, lsn, req.latest, ctx)
            .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -838,15 +760,13 @@ impl PageServerHandler {
        timeline: &Timeline,
        req: &PagestreamNblocksRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

-        let n_blocks = timeline
-            .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
-            .await?;
+        let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
            n_blocks,
@@ -858,20 +778,14 @@ impl PageServerHandler {
        timeline: &Timeline,
        req: &PagestreamDbSizeRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

        let total_blocks = timeline
-            .get_db_size(
-                DEFAULTTABLESPACE_OID,
-                req.dbnode,
-                Version::Lsn(lsn),
-                req.latest,
-                ctx,
-            )
+            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
            .await?;
        let db_size = total_blocks as i64 * BLCKSZ as i64;

@@ -880,35 +794,30 @@ impl PageServerHandler {
        }))
    }

-    async fn do_handle_get_page_at_lsn_request(
-        &self,
-        timeline: &Timeline,
-        req: &PagestreamGetPageRequest,
-        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn =
-            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
-                .await?;
-        let page = timeline
-            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
-            .await?;
-
-        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
-            page,
-        }))
-    }
-
    async fn handle_get_page_at_lsn_request(
        &self,
        timeline: &Timeline,
        req: &PagestreamGetPageRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;
+        /*
+        // Add a 1s delay to some requests. The delay helps the requests to
+        // hit the race condition from github issue #1047 more easily.
+        use rand::Rng;
+        if rand::thread_rng().gen::<u8>() < 5 {
+            std::thread::sleep(std::time::Duration::from_millis(1000));
+        }
+        */
+
        let key = rel_block_to_key(req.rel, req.blkno);
-        if timeline.get_shard_identity().is_key_local(&key) {
-            self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
-                .await
+        let page = if timeline.get_shard_identity().is_key_local(&key) {
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
        } else {
            // The Tenant shard we looked up at connection start does not hold this particular
            // key: look for other shards in this tenant.  This scenario occurs if a pageserver
@@ -927,30 +836,30 @@ impl PageServerHandler {
                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
                    // We already know this tenant exists in general, because we resolved it at
                    // start of connection.  Getting a NotFound here indicates that the shard containing
-                    // the requested page is not present on this node: the client's knowledge of shard->pageserver
-                    // mapping is out of date.
-                    tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
-                        timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
-                    // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
-                    // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
-                    // and talk to a different pageserver.
-                    return Err(PageStreamError::Reconnect(
-                        "getpage@lsn request routed to wrong shard".into(),
-                    ));
+                    // the requested page is not present on this node.
+
+                    // TODO: this should be some kind of structured error that the client will understand,
+                    // so that it can block until its config is updated: this error is expected in the case
+                    // that the Tenant's shards' placements are being updated and the client hasn't been
+                    // informed yet.
+                    //
+                    // https://github.com/neondatabase/neon/issues/6038
+                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
                }
                Err(e) => return Err(e.into()),
            };

            // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
            // the GateGuard was already held over the whole connection.
-            let _timeline_guard = timeline
-                .gate
-                .enter()
-                .map_err(|_| PageStreamError::Shutdown)?;
+            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        };

-            self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
-                .await
-        }
+        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+            page,
+        }))
    }

    #[allow(clippy::too_many_arguments)]
@@ -1091,7 +1000,9 @@ impl PageServerHandler {
        )
        .await
        .map_err(GetActiveTimelineError::Tenant)?;
-        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
        Ok(timeline)
    }
 }
@@ -1513,15 +1424,14 @@ enum GetActiveTimelineError {
    #[error(transparent)]
    Tenant(GetActiveTenantError),
    #[error(transparent)]
-    Timeline(#[from] GetTimelineError),
+    Timeline(anyhow::Error),
 }

 impl From<GetActiveTimelineError> for QueryError {
    fn from(e: GetActiveTimelineError) -> Self {
        match e {
-            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
            GetActiveTimelineError::Tenant(e) => e.into(),
-            GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
+            GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
        }
    }
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -11,7 +11,7 @@ use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
-use anyhow::{ensure, Context};
+use anyhow::Context;
 use bytes::{Buf, Bytes};
 use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
@@ -147,7 +147,6 @@ impl Timeline {
    {
        DatadirModification {
            tline: self,
-            pending_lsns: Vec::new(),
            pending_updates: HashMap::new(),
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
@@ -160,11 +159,11 @@ impl Timeline {
    //------------------------------------------------------------------------------

    /// Look up given page version.
-    pub(crate) async fn get_rel_page_at_lsn(
+    pub async fn get_rel_page_at_lsn(
        &self,
        tag: RelTag,
        blknum: BlockNumber,
-        version: Version<'_>,
+        lsn: Lsn,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
@@ -174,47 +173,55 @@ impl Timeline {
            ));
        }

-        let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
-        if blknum >= nblocks {
-            debug!(
-                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                tag,
-                blknum,
-                version.get_lsn(),
-                nblocks
-            );
-            return Ok(ZERO_PAGE.clone());
-        }
+        // let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
+        // if blknum >= nblocks {
+        //     tracing::info!(
+        //         "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
+        //         tag,
+        //         blknum,
+        //         lsn,
+        //         nblocks
+        //     );
+        //     return Ok(ZERO_PAGE.clone());
+        // } else {
+        //     tracing::info!(
+        //         "read within bounds at {} blk {} at {}, size is {}",
+        //         tag,
+        //         blknum,
+        //         lsn,
+        //         nblocks
+        //     );
+        // }

        let key = rel_block_to_key(tag, blknum);
-        version.get(self, key, ctx).await
+        self.get(key, lsn, ctx).await
    }

    // Get size of a database in blocks
-    pub(crate) async fn get_db_size(
+    pub async fn get_db_size(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        version: Version<'_>,
+        lsn: Lsn,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<usize, PageReconstructError> {
        let mut total_blocks = 0;

-        let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
+        let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;

        for rel in rels {
-            let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
+            let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
            total_blocks += n_blocks as usize;
        }
        Ok(total_blocks)
    }

    /// Get size of a relation file
-    pub(crate) async fn get_rel_size(
+    pub async fn get_rel_size(
        &self,
        tag: RelTag,
-        version: Version<'_>,
+        lsn: Lsn,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
@@ -224,12 +231,12 @@ impl Timeline {
            ));
        }

-        if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
+        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
            return Ok(nblocks);
        }

        if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, version, latest, ctx).await?
+            && !self.get_rel_exists(tag, lsn, latest, ctx).await?
        {
            // FIXME: Postgres sometimes calls smgrcreate() to create
            // FSM, and smgrnblocks() on it immediately afterwards,
@@ -239,7 +246,8 @@ impl Timeline {
        }

        let key = rel_size_to_key(tag);
-        let mut buf = version.get(self, key, ctx).await?;
+        tracing::info!("rel size cache miss {tag} {lsn} {latest}");
+        let mut buf = self.get(key, lsn, ctx).await?;
        let nblocks = buf.get_u32_le();

        if latest {
@@ -250,16 +258,16 @@ impl Timeline {
            // latest=true, then it can not cause cache corruption, because with latest=true
            // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
            // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+            self.update_cached_rel_size(tag, lsn, nblocks);
        }
        Ok(nblocks)
    }

    /// Does relation exist?
-    pub(crate) async fn get_rel_exists(
+    pub async fn get_rel_exists(
        &self,
        tag: RelTag,
-        version: Version<'_>,
+        lsn: Lsn,
        _latest: bool,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
@@ -270,12 +278,12 @@ impl Timeline {
        }

        // first try to lookup relation in cache
-        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
+        if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
            return Ok(true);
        }
        // fetch directory listing
        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;

        match RelDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -291,16 +299,16 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub(crate) async fn list_rels(
+    pub async fn list_rels(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<HashSet<RelTag>, PageReconstructError> {
        // fetch directory listing
        let key = rel_dir_to_key(spcnode, dbnode);
-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;

        match RelDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -319,7 +327,7 @@ impl Timeline {
    }

    /// Look up given SLRU page version.
-    pub(crate) async fn get_slru_page_at_lsn(
+    pub async fn get_slru_page_at_lsn(
        &self,
        kind: SlruKind,
        segno: u32,
@@ -332,29 +340,29 @@ impl Timeline {
    }

    /// Get size of an SLRU segment
-    pub(crate) async fn get_slru_segment_size(
+    pub async fn get_slru_segment_size(
        &self,
        kind: SlruKind,
        segno: u32,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        let key = slru_segment_size_to_key(kind, segno);
-        let mut buf = version.get(self, key, ctx).await?;
+        let mut buf = self.get(key, lsn, ctx).await?;
        Ok(buf.get_u32_le())
    }

    /// Get size of an SLRU segment
-    pub(crate) async fn get_slru_segment_exists(
+    pub async fn get_slru_segment_exists(
        &self,
        kind: SlruKind,
        segno: u32,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        // fetch directory listing
        let key = slru_dir_to_key(kind);
-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;

        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -372,7 +380,7 @@ impl Timeline {
    /// so it's not well defined which LSN you get if there were multiple commits
    /// "in flight" at that point in time.
    ///
-    pub(crate) async fn find_lsn_for_timestamp(
+    pub async fn find_lsn_for_timestamp(
        &self,
        search_timestamp: TimestampTz,
        cancel: &CancellationToken,
@@ -452,7 +460,7 @@ impl Timeline {
    /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
    /// with a smaller/larger timestamp.
    ///
-    pub(crate) async fn is_latest_commit_timestamp_ge_than(
+    pub async fn is_latest_commit_timestamp_ge_than(
        &self,
        search_timestamp: TimestampTz,
        probe_lsn: Lsn,
@@ -475,7 +483,7 @@ impl Timeline {
    /// Obtain the possible timestamp range for the given lsn.
    ///
    /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
-    pub(crate) async fn get_timestamp_for_lsn(
+    pub async fn get_timestamp_for_lsn(
        &self,
        probe_lsn: Lsn,
        ctx: &RequestContext,
@@ -505,11 +513,11 @@ impl Timeline {
        mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
    ) -> Result<T, PageReconstructError> {
        for segno in self
-            .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
+            .list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
            .await?
        {
            let nblocks = self
-                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
+                .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
                .await?;
            for blknum in (0..nblocks).rev() {
                let clog_page = self
@@ -532,36 +540,36 @@ impl Timeline {
    }

    /// Get a list of SLRU segments
-    pub(crate) async fn list_slru_segments(
+    pub async fn list_slru_segments(
        &self,
        kind: SlruKind,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<HashSet<u32>, PageReconstructError> {
        // fetch directory entry
        let key = slru_dir_to_key(kind);

-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;
        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => Ok(dir.segments),
            Err(e) => Err(PageReconstructError::from(e)),
        }
    }

-    pub(crate) async fn get_relmap_file(
+    pub async fn get_relmap_file(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        let key = relmap_file_key(spcnode, dbnode);

-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;
        Ok(buf)
    }

-    pub(crate) async fn list_dbdirs(
+    pub async fn list_dbdirs(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -575,7 +583,7 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_twophase_file(
+    pub async fn get_twophase_file(
        &self,
        xid: TransactionId,
        lsn: Lsn,
@@ -586,7 +594,7 @@ impl Timeline {
        Ok(buf)
    }

-    pub(crate) async fn list_twophase_files(
+    pub async fn list_twophase_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -600,7 +608,7 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_control_file(
+    pub async fn get_control_file(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -608,7 +616,7 @@ impl Timeline {
        self.get(CONTROLFILE_KEY, lsn, ctx).await
    }

-    pub(crate) async fn get_checkpoint(
+    pub async fn get_checkpoint(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -616,7 +624,7 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

-    pub(crate) async fn list_aux_files(
+    pub async fn list_aux_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -656,10 +664,7 @@ impl Timeline {

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self
-                .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
-                .await?
-            {
+            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
                if self.cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
@@ -699,7 +704,7 @@ impl Timeline {
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
+                .list_rels(spcnode, dbnode, lsn, ctx)
                .await?
                .into_iter()
                .collect();
@@ -806,39 +811,18 @@ pub struct DatadirModification<'a> {
    /// in the state in 'tline' yet.
    pub tline: &'a Timeline,

-    /// Current LSN of the modification
-    lsn: Lsn,
+    /// LSN assigned by begin_modification; pending updates and deletions are written at this LSN.
+    pub lsn: Lsn,

    // The modifications are not applied directly to the underlying key-value store.
    // The put-functions add the modifications here, and they are flushed to the
    // underlying key-value store by the 'finish' function.
-    pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
-    pending_deletions: Vec<(Range<Key>, Lsn)>,
+    pending_updates: HashMap<Key, Value>,
+    pending_deletions: Vec<Range<Key>>,
    pending_nblocks: i64,
 }

 impl<'a> DatadirModification<'a> {
-    /// Get the current lsn
-    pub(crate) fn get_lsn(&self) -> Lsn {
-        self.lsn
-    }
-
-    /// Set the current lsn
-    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
-        ensure!(
-            lsn >= self.lsn,
-            "setting an older lsn {} than {} is not allowed",
-            lsn,
-            self.lsn
-        );
-        if lsn > self.lsn {
-            self.pending_lsns.push(self.lsn);
-            self.lsn = lsn;
-        }
-        Ok(())
-    }
-
    /// Initialize a completely new repository.
    ///
    /// This inserts the directory metadata entries that are assumed to
@@ -1012,9 +996,11 @@ impl<'a> DatadirModification<'a> {
        dbnode: Oid,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let req_lsn = self.tline.get_last_record_lsn();
+
        let total_blocks = self
            .tline
-            .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
+            .get_db_size(spcnode, dbnode, req_lsn, true, ctx)
            .await?;

        // Remove entry from dbdir
@@ -1103,11 +1089,8 @@ impl<'a> DatadirModification<'a> {
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-        if self
-            .tline
-            .get_rel_exists(rel, Version::Modified(self), true, ctx)
-            .await?
-        {
+        let last_lsn = self.tline.get_last_record_lsn();
+        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
            let size_key = rel_size_to_key(rel);
            // Fetch the old size first
            let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1352,23 +1335,17 @@ impl<'a> DatadirModification<'a> {
        let writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
-        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
-        for (key, values) in self.pending_updates.drain() {
-            for (lsn, value) in values {
-                if is_rel_block_key(&key) || is_slru_block_key(key) {
-                    // This bails out on first error without modifying pending_updates.
-                    // That's Ok, cf this function's doc comment.
-                    writer.put(key, lsn, &value, ctx).await?;
-                } else {
-                    retained_pending_updates
-                        .entry(key)
-                        .or_default()
-                        .push((lsn, value));
-                }
+        let mut retained_pending_updates = HashMap::new();
+        for (key, value) in self.pending_updates.drain() {
+            if is_rel_block_key(&key) || is_slru_block_key(key) {
+                // This bails out on first error without modifying pending_updates.
+                // That's Ok, cf this function's doc comment.
+                writer.put(key, self.lsn, &value, ctx).await?;
+            } else {
+                retained_pending_updates.insert(key, value);
            }
        }
-
-        self.pending_updates = retained_pending_updates;
+        self.pending_updates.extend(retained_pending_updates);

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1385,28 +1362,18 @@ impl<'a> DatadirModification<'a> {
    ///
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        let writer = self.tline.writer().await;
-
+        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

-        if !self.pending_updates.is_empty() {
-            writer.put_batch(&self.pending_updates, ctx).await?;
-            self.pending_updates.clear();
+        for (key, value) in self.pending_updates.drain() {
+            writer.put(key, lsn, &value, ctx).await?;
+        }
+        for key_range in self.pending_deletions.drain(..) {
+            writer.delete(key_range, lsn).await?;
        }

-        if !self.pending_deletions.is_empty() {
-            writer.delete_batch(&self.pending_deletions).await?;
-            self.pending_deletions.clear();
-        }
-
-        self.pending_lsns.push(self.lsn);
-        for pending_lsn in self.pending_lsns.drain(..) {
-            // Ideally, we should be able to call writer.finish_write() only once
-            // with the highest LSN. However, the last_record_lsn variable in the
-            // timeline keeps track of the latest LSN and the immediate previous LSN
-            // so we need to record every LSN to not leave a gap between them.
-            writer.finish_write(pending_lsn);
-        }
+        writer.finish_write(lsn);

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1415,86 +1382,44 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub(crate) fn len(&self) -> usize {
-        self.pending_updates.len() + self.pending_deletions.len()
+    pub(crate) fn is_empty(&self) -> bool {
+        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
    }

    // Internal helper functions to batch the modifications

    async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
-        // Have we already updated the same key? Read the latest pending updated
+        // Have we already updated the same key? Read the pending updated
        // version in that case.
        //
        // Note: we don't check pending_deletions. It is an error to request a
        // value that has been removed, deletion only avoids leaking storage.
-        if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, value)) = values.last() {
-                return if let Value::Image(img) = value {
-                    Ok(img.clone())
-                } else {
-                    // Currently, we never need to read back a WAL record that we
-                    // inserted in the same "transaction". All the metadata updates
-                    // work directly with Images, and we never need to read actual
-                    // data pages. We could handle this if we had to, by calling
-                    // the walredo manager, but let's keep it simple for now.
-                    Err(PageReconstructError::from(anyhow::anyhow!(
-                        "unexpected pending WAL record"
-                    )))
-                };
+        if let Some(value) = self.pending_updates.get(&key) {
+            if let Value::Image(img) = value {
+                Ok(img.clone())
+            } else {
+                // Currently, we never need to read back a WAL record that we
+                // inserted in the same "transaction". All the metadata updates
+                // work directly with Images, and we never need to read actual
+                // data pages. We could handle this if we had to, by calling
+                // the walredo manager, but let's keep it simple for now.
+                Err(PageReconstructError::from(anyhow::anyhow!(
+                    "unexpected pending WAL record"
+                )))
            }
+        } else {
+            let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+            self.tline.get(key, lsn, ctx).await
        }
-        let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
-        self.tline.get(key, lsn, ctx).await
    }

    fn put(&mut self, key: Key, val: Value) {
-        let values = self.pending_updates.entry(key).or_default();
-        // Replace the previous value if it exists at the same lsn
-        if let Some((last_lsn, last_value)) = values.last_mut() {
-            if *last_lsn == self.lsn {
-                *last_value = val;
-                return;
-            }
-        }
-        values.push((self.lsn, val));
+        self.pending_updates.insert(key, val);
    }

    fn delete(&mut self, key_range: Range<Key>) {
        trace!("DELETE {}-{}", key_range.start, key_range.end);
-        self.pending_deletions.push((key_range, self.lsn));
-    }
-}
-
-/// This struct facilitates accessing either a committed key from the timeline at a
-/// specific LSN, or the latest uncommitted key from a pending modification.
-/// During WAL ingestion, the records from multiple LSNs may be batched in the same
-/// modification before being flushed to the timeline. Hence, the routines in WalIngest
-/// need to look up the keys in the modification first before looking them up in the
-/// timeline to not miss the latest updates.
-#[derive(Clone, Copy)]
-pub enum Version<'a> {
-    Lsn(Lsn),
-    Modified(&'a DatadirModification<'a>),
-}
-
-impl<'a> Version<'a> {
-    async fn get(
-        &self,
-        timeline: &Timeline,
-        key: Key,
-        ctx: &RequestContext,
-    ) -> Result<Bytes, PageReconstructError> {
-        match self {
-            Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
-            Version::Modified(modification) => modification.get(key, ctx).await,
-        }
-    }
-
-    fn get_lsn(&self) -> Lsn {
-        match self {
-            Version::Lsn(lsn) => *lsn,
-            Version::Modified(modification) => modification.lsn,
-        }
+        self.pending_deletions.push(key_range);
    }
 }

--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
    // else, but that has not been needed in a long time.
    std::env::var("TOKIO_WORKER_THREADS")
        .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
+        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
 });

 #[derive(Debug, Clone, Copy)]
@@ -258,9 +258,6 @@ pub enum TaskKind {
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

-    /// See [`crate::tenant::secondary`].
-    SecondaryDownloads,
-
    /// See [`crate::tenant::secondary`].
    SecondaryUploads,

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -33,7 +33,6 @@ use tracing::*;
 use utils::backoff;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
-use utils::failpoint_support;
 use utils::fs_ext;
 use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
@@ -56,7 +55,6 @@ use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
-use self::timeline::WaitLsnError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
@@ -596,9 +594,10 @@ impl Tenant {
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
+        // TODO(sharding): make WalRedoManager shard-aware
        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
            conf,
-            tenant_shard_id,
+            tenant_shard_id.tenant_id,
        )));

        let TenantSharedResources {
@@ -891,7 +890,7 @@ impl Tenant {
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

-        failpoint_support::sleep_millis_async!("before-attaching-tenant");
+        crate::failpoint_support::sleep_millis_async!("before-attaching-tenant");

        let preload = match preload {
            Some(p) => p,
@@ -1003,7 +1002,7 @@ impl Tenant {
        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

-        failpoint_support::sleep_millis_async!("attach-before-activate");
+        crate::failpoint_support::sleep_millis_async!("attach-before-activate");

        info!("Done");

@@ -1145,9 +1144,10 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        reason: String,
    ) -> Arc<Tenant> {
+        // TODO(sharding): make WalRedoManager shard-aware
        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
            conf,
-            tenant_shard_id,
+            tenant_shard_id.tenant_id,
        )));
        Arc::new(Tenant::new(
            TenantState::Broken {
@@ -1759,15 +1759,7 @@ impl Tenant {
                    // decoding the new WAL might need to look up previous pages, relation
                    // sizes etc. and that would get confused if the previous page versions
                    // are not in the repository yet.
-                    ancestor_timeline
-                        .wait_lsn(*lsn, ctx)
-                        .await
-                        .map_err(|e| match e {
-                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
-                                CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
-                            }
-                            WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
-                        })?;
+                    ancestor_timeline.wait_lsn(*lsn, ctx).await?;
                }

                self.branch_timeline(
@@ -2847,7 +2839,9 @@ impl Tenant {
            }
        };

-        failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
+        crate::failpoint_support::sleep_millis_async!(
+            "gc_iteration_internal_after_getting_gc_timelines"
+        );

        // If there is nothing to GC, we don't want any messages in the INFO log.
        if !gc_timelines.is_empty() {
@@ -3140,7 +3134,6 @@ impl Tenant {

    /// For unit tests, make this visible so that other modules can directly create timelines
    #[cfg(test)]
-    #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
    pub(crate) async fn bootstrap_timeline_test(
        &self,
        timeline_id: TimelineId,
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -22,6 +22,8 @@ pub trait BlockReader {
    /// A cursor caches the last accessed page, allowing for faster
    /// access if the same block is accessed repeatedly.
    fn block_cursor(&self) -> BlockCursor<'_>;
+
+    fn block_cursor_direct(&self) -> BlockCursor<'_>;
 }

 impl<B> BlockReader for &B
@@ -31,12 +33,17 @@ where
    fn block_cursor(&self) -> BlockCursor<'_> {
        (*self).block_cursor()
    }
+
+    fn block_cursor_direct(&self) -> BlockCursor<'_> {
+        (*self).block_cursor_direct() // delegate to B's direct variant, not block_cursor()
+    }
 }

 /// Reference to an in-memory copy of an immutable on-disk block.
 pub enum BlockLease<'a> {
    PageReadGuard(PageReadGuard<'static>),
    EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
+    Direct(bytes::Bytes),
    #[cfg(test)]
    Arc(std::sync::Arc<[u8; PAGE_SZ]>),
 }
@@ -61,6 +68,7 @@ impl<'a> Deref for BlockLease<'a> {
        match self {
            BlockLease::PageReadGuard(v) => v.deref(),
            BlockLease::EphemeralFileMutableTail(v) => v,
+            BlockLease::Direct(b) => <&[u8; PAGE_SZ]>::try_from(b as &[u8]).unwrap(),
            #[cfg(test)]
            BlockLease::Arc(v) => v.deref(),
        }
@@ -99,6 +107,24 @@ impl<'a> BlockReaderRef<'a> {
            VirtualFile(r) => r.read_blk(blknum).await,
        }
    }
+
+    #[inline(always)]
+    async fn read_blk_direct(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, std::io::Error> {
+        use BlockReaderRef::*;
+        match self {
+            FileBlockReader(r) => r.read_blk_direct(blknum, ctx).await,
+            EphemeralFile(r) => r.read_blk(blknum, ctx).await,
+            Adapter(r) => r.read_blk(blknum, ctx).await,
+            #[cfg(test)]
+            TestDisk(r) => r.read_blk(blknum),
+            #[cfg(test)]
+            VirtualFile(r) => r.read_blk(blknum).await,
+        }
+    }
 }

 ///
@@ -121,17 +147,28 @@ impl<'a> BlockReaderRef<'a> {
 /// ```
 ///
 pub struct BlockCursor<'a> {
+    direct: bool,
    reader: BlockReaderRef<'a>,
 }

 impl<'a> BlockCursor<'a> {
    pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
-        BlockCursor { reader }
+        BlockCursor {
+            reader,
+            direct: false,
+        }
+    }
+    pub(crate) fn new_direct(reader: BlockReaderRef<'a>) -> Self {
+        BlockCursor {
+            reader,
+            direct: true,
+        }
    }
    // Needed by cli
    pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
        BlockCursor {
            reader: BlockReaderRef::FileBlockReader(reader),
+            direct: false,
        }
    }

@@ -146,7 +183,11 @@ impl<'a> BlockCursor<'a> {
        blknum: u32,
        ctx: &RequestContext,
    ) -> Result<BlockLease, std::io::Error> {
-        self.reader.read_blk(blknum, ctx).await
+        if self.direct {
+            self.reader.read_blk_direct(blknum, ctx).await
+        } else {
+            self.reader.read_blk(blknum, ctx).await
+        }
    }
 }

@@ -203,12 +244,27 @@ impl FileBlockReader {
            }
        }
    }
+
+    pub async fn read_blk_direct(
+        &self,
+        blknum: u32,
+        _ctx: &RequestContext,
+    ) -> Result<BlockLease, std::io::Error> {
+        let mut buf = bytes::BytesMut::zeroed(PAGE_SZ);
+        let buffer = <&mut [u8; PAGE_SZ]>::try_from(&mut buf as &mut [u8]).unwrap();
+        self.fill_buffer(buffer, blknum).await?;
+        Ok(BlockLease::Direct(buf.into()))
+    }
 }

 impl BlockReader for FileBlockReader {
    fn block_cursor(&self) -> BlockCursor<'_> {
        BlockCursor::new(BlockReaderRef::FileBlockReader(self))
    }
+
+    fn block_cursor_direct(&self) -> BlockCursor<'_> {
+        BlockCursor::new_direct(BlockReaderRef::FileBlockReader(self))
+    }
 }

 ///
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -46,8 +46,6 @@ pub mod defaults {
    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -48,9 +48,6 @@ pub(crate) enum DeleteTenantError {
    #[error("Timeline {0}")]
    Timeline(#[from] DeleteTimelineError),

-    #[error("Cancelled")]
-    Cancelled,
-
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -588,7 +585,7 @@ impl DeleteTenantFlow {
                            }
                            break;
                        }
-                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
                            // This is unexpected: this secondary tenants should not have been created, and we
                            // are not in a position to shut it down from here.
                            tracing::warn!("Tenant transitioned to secondary mode while deleting!");
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -266,6 +266,10 @@ impl BlockReader for EphemeralFile {
    fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
    }
+
+    fn block_cursor_direct(&self) -> super::block_io::BlockCursor<'_> {
+        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
+    }
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -44,7 +44,6 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
-use super::secondary::SecondaryTenant;
 use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
@@ -58,7 +57,7 @@ use super::TenantSharedResources;
 /// having a properly acquired generation (Secondary doesn't need a generation)
 pub(crate) enum TenantSlot {
    Attached(Arc<Tenant>),
-    Secondary(Arc<SecondaryTenant>),
+    Secondary,
    /// In this state, other administrative operations acting on the TenantId should
    /// block, or return a retry indicator equivalent to HTTP 503.
    InProgress(utils::completion::Barrier),
@@ -68,7 +67,7 @@ impl std::fmt::Debug for TenantSlot {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
-            Self::Secondary(_) => write!(f, "Secondary"),
+            Self::Secondary => write!(f, "Secondary"),
            Self::InProgress(_) => write!(f, "InProgress"),
        }
    }
@@ -79,7 +78,7 @@ impl TenantSlot {
    fn get_attached(&self) -> Option<&Arc<Tenant>> {
        match self {
            Self::Attached(t) => Some(t),
-            Self::Secondary(_) => None,
+            Self::Secondary => None,
            Self::InProgress(_) => None,
        }
    }
@@ -131,7 +130,7 @@ impl TenantsMap {

    /// A page service client sends a TenantId, and to look up the correct Tenant we must
    /// resolve this to a fully qualified TenantShardId.
-    fn resolve_attached_shard(
+    fn resolve_shard(
        &self,
        tenant_id: &TenantId,
        selector: ShardSelector,
@@ -141,27 +140,25 @@ impl TenantsMap {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
-                    // Ignore all slots that don't contain an attached tenant
-                    let tenant = match &slot.1 {
-                        TenantSlot::Attached(t) => t,
-                        _ => continue,
-                    };
-
                    match selector {
                        ShardSelector::First => return Some(*slot.0),
                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                            return Some(*slot.0)
                        }
                        ShardSelector::Page(key) => {
-                            // First slot we see for this tenant, calculate the expected shard number
-                            // for the key: we will use this for checking if this and subsequent
-                            // slots contain the key, rather than recalculating the hash each time.
-                            if want_shard.is_none() {
-                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                            }
+                            if let Some(tenant) = slot.1.get_attached() {
+                                // First slot we see for this tenant, calculate the expected shard number
+                                // for the key: we will use this for checking if this and subsequent
+                                // slots contain the key, rather than recalculating the hash each time.
+                                if want_shard.is_none() {
+                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                                }

-                            if Some(tenant.shard_identity.number) == want_shard {
-                                return Some(*slot.0);
+                                if Some(tenant.shard_identity.number) == want_shard {
+                                    return Some(*slot.0);
+                                }
+                            } else {
+                                continue;
                            }
                        }
                        _ => continue,
@@ -467,18 +464,12 @@ pub async fn init_tenant_mgr(
                *gen
            } else {
                match &location_conf.mode {
-                    LocationMode::Secondary(secondary_config) => {
+                    LocationMode::Secondary(_) => {
                        // We do not require the control plane's permission for secondary mode
                        // tenants, because they do no remote writes and hence require no
                        // generation number
                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
-                        tenants.insert(
-                            tenant_shard_id,
-                            TenantSlot::Secondary(SecondaryTenant::new(
-                                tenant_shard_id,
-                                secondary_config,
-                            )),
-                        );
+                        tenants.insert(tenant_shard_id, TenantSlot::Secondary);
                    }
                    LocationMode::Attached(_) => {
                        // TODO: augment re-attach API to enable the control plane to
@@ -523,7 +514,10 @@ pub async fn init_tenant_mgr(
            &ctx,
        ) {
            Ok(tenant) => {
-                tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
+                tenants.insert(
+                    TenantShardId::unsharded(tenant.tenant_id()),
+                    TenantSlot::Attached(tenant),
+                );
            }
            Err(e) => {
                error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
@@ -670,14 +664,8 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {

                            total_attached += 1;
                        }
-                        TenantSlot::Secondary(state) => {
-                            // We don't need to wait for this individually per-tenant: the
-                            // downloader task will be waited on eventually, this cancel
-                            // is just to encourage it to drop out if it is doing work
-                            // for this tenant right now.
-                            state.cancel.cancel();
-
-                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
+                        TenantSlot::Secondary => {
+                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
                        }
                        TenantSlot::InProgress(notify) => {
                            // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
@@ -860,28 +848,12 @@ impl TenantManager {
            Some(TenantSlot::InProgress(_)) => {
                Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
            }
-            None | Some(TenantSlot::Secondary(_)) => {
+            None | Some(TenantSlot::Secondary) => {
                Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
            }
        }
    }

-    pub(crate) fn get_secondary_tenant_shard(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> Option<Arc<SecondaryTenant>> {
-        let locked = self.tenants.read().unwrap();
-
-        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
-            .ok()
-            .flatten();
-
-        match peek_slot {
-            Some(TenantSlot::Secondary(s)) => Some(s.clone()),
-            _ => None,
-        }
-    }
-
    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(crate) async fn upsert_location(
        &self,
@@ -893,15 +865,10 @@ impl TenantManager {
        debug_assert_current_span_has_tenant_id();
        info!("configuring tenant location to state {new_location_config:?}");

-        enum FastPathModified {
-            Attached(Arc<Tenant>),
-            Secondary(Arc<SecondaryTenant>),
-        }
-
-        // Special case fast-path for updates to existing slots: if our upsert is only updating configuration,
+        // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
        // then we do not need to set the slot to InProgress, we can just call into the
        // existng tenant.
-        let fast_path_taken = {
+        let modify_tenant = {
            let locked = self.tenants.read().unwrap();
            let peek_slot =
                tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -915,19 +882,12 @@ impl TenantManager {
                            new_location_config.clone(),
                        )?);

-                        Some(FastPathModified::Attached(tenant.clone()))
+                        Some(tenant.clone())
                    } else {
                        // Different generations, fall through to general case
                        None
                    }
                }
-                (
-                    LocationMode::Secondary(secondary_conf),
-                    Some(TenantSlot::Secondary(secondary_tenant)),
-                ) => {
-                    secondary_tenant.set_config(secondary_conf);
-                    Some(FastPathModified::Secondary(secondary_tenant.clone()))
-                }
                _ => {
                    // Not an Attached->Attached transition, fall through to general case
                    None
@@ -936,51 +896,34 @@ impl TenantManager {
        };

        // Fast-path continued: having dropped out of the self.tenants lock, do the async
-        // phase of writing config and/or waiting for flush, before returning.
-        match fast_path_taken {
-            Some(FastPathModified::Attached(tenant)) => {
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .map_err(SetNewTenantConfigError::Persist)?;
-
-                // Transition to AttachedStale means we may well hold a valid generation
-                // still, and have been requested to go stale as part of a migration.  If
-                // the caller set `flush`, then flush to remote storage.
-                if let LocationMode::Attached(AttachedLocationConfig {
-                    generation: _,
-                    attach_mode: AttachmentMode::Stale,
-                }) = &new_location_config.mode
-                {
-                    if let Some(flush_timeout) = flush {
-                        match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
-                            Ok(Err(e)) => {
-                                return Err(e);
-                            }
-                            Ok(Ok(_)) => return Ok(()),
-                            Err(_) => {
-                                tracing::warn!(
+        // phase of waiting for flush, before returning.
+        if let Some(tenant) = modify_tenant {
+            // Transition to AttachedStale means we may well hold a valid generation
+            // still, and have been requested to go stale as part of a migration.  If
+            // the caller set `flush`, then flush to remote storage.
+            if let LocationMode::Attached(AttachedLocationConfig {
+                generation: _,
+                attach_mode: AttachmentMode::Stale,
+            }) = &new_location_config.mode
+            {
+                if let Some(flush_timeout) = flush {
+                    match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
+                        Ok(Err(e)) => {
+                            return Err(e);
+                        }
+                        Ok(Ok(_)) => return Ok(()),
+                        Err(_) => {
+                            tracing::warn!(
                                timeout_ms = flush_timeout.as_millis(),
                                "Timed out waiting for flush to remote storage, proceeding anyway."
                            )
-                            }
                        }
                    }
                }
+            }

-                return Ok(());
-            }
-            Some(FastPathModified::Secondary(_secondary_tenant)) => {
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .map_err(SetNewTenantConfigError::Persist)?;
-
-                return Ok(());
-            }
-            None => {
-                // Proceed with the general case procedure, where we will shutdown & remove any existing
-                // slot contents and replace with a fresh one
-            }
-        };
+            return Ok(());
+        }

        // General case for upserts to TenantsMap, excluding the case above: we will substitute an
        // InProgress value to the slot while we make whatever changes are required.  The state for
@@ -989,73 +932,65 @@ impl TenantManager {
        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;

-        match slot_guard.get_old_value() {
-            Some(TenantSlot::Attached(tenant)) => {
-                // The case where we keep a Tenant alive was covered above in the special case
-                // for Attached->Attached transitions in the same generation.  By this point,
-                // if we see an attached tenant we know it will be discarded and should be
-                // shut down.
-                let (_guard, progress) = utils::completion::channel();
+        if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
+            // The case where we keep a Tenant alive was covered above in the special case
+            // for Attached->Attached transitions in the same generation.  By this point,
+            // if we see an attached tenant we know it will be discarded and should be
+            // shut down.
+            let (_guard, progress) = utils::completion::channel();

-                match tenant.get_attach_mode() {
-                    AttachmentMode::Single | AttachmentMode::Multi => {
-                        // Before we leave our state as the presumed holder of the latest generation,
-                        // flush any outstanding deletions to reduce the risk of leaking objects.
-                        self.resources.deletion_queue_client.flush_advisory()
-                    }
-                    AttachmentMode::Stale => {
-                        // If we're stale there's not point trying to flush deletions
-                    }
-                };
-
-                info!("Shutting down attached tenant");
-                match tenant.shutdown(progress, false).await {
-                    Ok(()) => {}
-                    Err(barrier) => {
-                        info!("Shutdown already in progress, waiting for it to complete");
-                        barrier.wait().await;
-                    }
+            match tenant.get_attach_mode() {
+                AttachmentMode::Single | AttachmentMode::Multi => {
+                    // Before we leave our state as the presumed holder of the latest generation,
+                    // flush any outstanding deletions to reduce the risk of leaking objects.
+                    self.resources.deletion_queue_client.flush_advisory()
+                }
+                AttachmentMode::Stale => {
+                    // If we're stale there's no point trying to flush deletions
+                }
+            };
+
+            info!("Shutting down attached tenant");
+            match tenant.shutdown(progress, false).await {
+                Ok(()) => {}
+                Err(barrier) => {
+                    info!("Shutdown already in progress, waiting for it to complete");
+                    barrier.wait().await;
                }
-                slot_guard.drop_old_value().expect("We just shut it down");
-            }
-            Some(TenantSlot::Secondary(state)) => {
-                info!("Shutting down secondary tenant");
-                state.shutdown().await;
-            }
-            Some(TenantSlot::InProgress(_)) => {
-                // This should never happen: acquire_slot should error out
-                // if the contents of a slot were InProgress.
-                anyhow::bail!("Acquired an InProgress slot, this is a bug.")
-            }
-            None => {
-                // Slot was vacant, nothing needs shutting down.
            }
+            slot_guard.drop_old_value().expect("We just shut it down");
        }

        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
-
-        // Directory structure is the same for attached and secondary modes:
-        // create it if it doesn't exist.  Timeline load/creation expects the
-        // timelines/ subdir to already exist.
-        //
-        // Does not need to be fsync'd because local storage is just a cache.
-        tokio::fs::create_dir_all(&timelines_path)
-            .await
-            .with_context(|| format!("Creating {timelines_path}"))?;
-
-        // Before activating either secondary or attached mode, persist the
-        // configuration, so that on restart we will re-attach (or re-start
-        // secondary) on the tenant.
-        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-            .await
-            .map_err(SetNewTenantConfigError::Persist)?;

        let new_slot = match &new_location_config.mode {
-            LocationMode::Secondary(secondary_config) => {
-                TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
+            LocationMode::Secondary(_) => {
+                // Directory doesn't need to be fsync'd because if we crash it can
+                // safely be recreated next time this tenant location is configured.
+                tokio::fs::create_dir_all(&tenant_path)
+                    .await
+                    .with_context(|| format!("Creating {tenant_path}"))?;
+
+                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;
+
+                TenantSlot::Secondary
            }
            LocationMode::Attached(_attach_config) => {
+                let timelines_path = self.conf.timelines_path(&tenant_shard_id);
+
+                // Create the timelines dir (create_dir_all also creates any missing parents).
+                // No fsync: we do not depend on it to exist after crashes, it may be recreated
+                // when tenant is re-attached, see https://github.com/neondatabase/neon/issues/5550
+                tokio::fs::create_dir_all(&timelines_path)
+                    .await
+                    .with_context(|| format!("Creating {timelines_path}"))?;
+
+                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;
+
                let shard_identity = new_location_config.shard;
                let tenant = tenant_spawn(
                    self.conf,
@@ -1167,95 +1102,6 @@ impl TenantManager {
                .collect(),
        }
    }
-    // Do some synchronous work for all tenant slots in Secondary state.  The provided
-    // callback should be small and fast, as it will be called inside the global
-    // TenantsMap lock.
-    pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
-    where
-        // TODO: let the callback return a hint to drop out of the loop early
-        F: FnMut(&TenantShardId, &Arc<SecondaryTenant>),
-    {
-        let locked = self.tenants.read().unwrap();
-
-        let map = match &*locked {
-            TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
-            TenantsMap::Open(m) => m,
-        };
-
-        for (tenant_id, slot) in map {
-            if let TenantSlot::Secondary(state) = slot {
-                // Only expose secondary tenants that are not currently shutting down
-                if !state.cancel.is_cancelled() {
-                    func(tenant_id, state)
-                }
-            }
-        }
-    }
-
-    pub(crate) async fn delete_tenant(
-        &self,
-        tenant_shard_id: TenantShardId,
-        activation_timeout: Duration,
-    ) -> Result<(), DeleteTenantError> {
-        // We acquire a SlotGuard during this function to protect against concurrent
-        // changes while the ::prepare phase of DeleteTenantFlow executes, but then
-        // have to return the Tenant to the map while the background deletion runs.
-        //
-        // TODO: refactor deletion to happen outside the lifetime of a Tenant.
-        // Currently, deletion requires a reference to the tenants map in order to
-        // keep the Tenant in the map until deletion is complete, and then remove
-        // it at the end.
-        //
-        // See https://github.com/neondatabase/neon/issues/5080
-
-        let slot_guard =
-            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
-
-        // unwrap is safe because we used MustExist mode when acquiring
-        let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
-            TenantSlot::Attached(tenant) => tenant.clone(),
-            _ => {
-                // Express "not attached" as equivalent to "not found"
-                return Err(DeleteTenantError::NotAttached);
-            }
-        };
-
-        match tenant.current_state() {
-            TenantState::Broken { .. } | TenantState::Stopping { .. } => {
-                // If a tenant is broken or stopping, DeleteTenantFlow can
-                // handle it: broken tenants proceed to delete, stopping tenants
-                // are checked for deletion already in progress.
-            }
-            _ => {
-                tenant
-                    .wait_to_become_active(activation_timeout)
-                    .await
-                    .map_err(|e| match e {
-                        GetActiveTenantError::WillNotBecomeActive(_) => {
-                            DeleteTenantError::InvalidState(tenant.current_state())
-                        }
-                        GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
-                        GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
-                        GetActiveTenantError::WaitForActiveTimeout {
-                            latest_state: _latest_state,
-                            wait_time: _wait_time,
-                        } => DeleteTenantError::InvalidState(tenant.current_state()),
-                    })?;
-            }
-        }
-
-        let result = DeleteTenantFlow::run(
-            self.conf,
-            self.resources.remote_storage.clone(),
-            &TENANTS,
-            tenant,
-        )
-        .await;
-
-        // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
-        slot_guard.revert();
-        result
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1305,7 +1151,7 @@ pub(crate) fn get_tenant(
        Some(TenantSlot::InProgress(_)) => {
            Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
        }
-        None | Some(TenantSlot::Secondary(_)) => {
+        None | Some(TenantSlot::Secondary) => {
            Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
        }
    }
@@ -1357,11 +1203,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
        let locked = TENANTS.read().unwrap();

        // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked
-            .resolve_attached_shard(&tenant_id, shard_selector)
-            .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
-                tenant_id,
-            )))?;
+        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
+            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
+        )?;

        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
            .map_err(GetTenantError::MapState)?;
@@ -1378,7 +1222,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                    }
                }
            }
-            Some(TenantSlot::Secondary(_)) => {
+            Some(TenantSlot::Secondary) => {
                return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
                    tenant_id,
                )))
@@ -1435,6 +1279,41 @@ pub(crate) async fn get_active_tenant_with_timeout(
    Ok(tenant)
 }

+pub(crate) async fn delete_tenant(
+    conf: &'static PageServerConf,
+    remote_storage: Option<GenericRemoteStorage>,
+    tenant_shard_id: TenantShardId,
+) -> Result<(), DeleteTenantError> {
+    // We acquire a SlotGuard during this function to protect against concurrent
+    // changes while the ::prepare phase of DeleteTenantFlow executes, but then
+    // have to return the Tenant to the map while the background deletion runs.
+    //
+    // TODO: refactor deletion to happen outside the lifetime of a Tenant.
+    // Currently, deletion requires a reference to the tenants map in order to
+    // keep the Tenant in the map until deletion is complete, and then remove
+    // it at the end.
+    //
+    // See https://github.com/neondatabase/neon/issues/5080
+
+    // TODO(sharding): make delete API sharding-aware
+    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+
+    // unwrap is safe because we used MustExist mode when acquiring
+    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
+        TenantSlot::Attached(tenant) => tenant.clone(),
+        _ => {
+            // Express "not attached" as equivalent to "not found"
+            return Err(DeleteTenantError::NotAttached);
+        }
+    };
+
+    let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await;
+
+    // The Tenant goes back into the map in Stopping state; it will eventually be removed by DeleteTenantFlow
+    slot_guard.revert();
+    result
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum DeleteTimelineError {
    #[error("Tenant {0}")]
@@ -1642,7 +1521,7 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
    Ok(m.iter()
        .filter_map(|(id, tenant)| match tenant {
            TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
-            TenantSlot::Secondary(_) => None,
+            TenantSlot::Secondary => None,
            TenantSlot::InProgress(_) => None,
        })
        .collect())
@@ -1899,7 +1778,11 @@ impl SlotGuard {
    fn old_value_is_shutdown(&self) -> bool {
        match self.old_value.as_ref() {
            Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
-            Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
+            Some(TenantSlot::Secondary) => {
+                // TODO: when adding secondary mode tenants, this will check for shutdown
+                // in the same way that we do for `Tenant` above
+                true
+            }
            Some(TenantSlot::InProgress(_)) => {
                // A SlotGuard cannot be constructed for a slot that was already InProgress
                unreachable!()
@@ -2109,19 +1992,26 @@ where
    let mut slot_guard =
        tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

-    // allow pageserver shutdown to await for our completion
-    let (_guard, progress) = completion::channel();
-
    // The SlotGuard allows us to manipulate the Tenant object without fear of some
    // concurrent API request doing something else for the same tenant ID.
    let attached_tenant = match slot_guard.get_old_value() {
-        Some(TenantSlot::Attached(tenant)) => {
+        Some(TenantSlot::Attached(t)) => Some(t),
+        _ => None,
+    };
+
+    // allow pageserver shutdown to await for our completion
+    let (_guard, progress) = completion::channel();
+
+    // If the tenant was attached, shut it down gracefully.  For secondary
+    // locations, this part is not necessary.
+    match &attached_tenant {
+        Some(attached_tenant) => {
            // whenever we remove a tenant from memory, we don't want to flush and wait for upload
            let freeze_and_flush = false;

            // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
            // that we can continue safely to cleanup.
-            match tenant.shutdown(progress, freeze_and_flush).await {
+            match attached_tenant.shutdown(progress, freeze_and_flush).await {
                Ok(()) => {}
                Err(_other) => {
                    // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
@@ -2130,19 +2020,11 @@ where
                    return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
                }
            }
-            Some(tenant)
        }
-        Some(TenantSlot::Secondary(secondary_state)) => {
-            tracing::info!("Shutting down in secondary mode");
-            secondary_state.shutdown().await;
-            None
+        None => {
+            // Nothing to wait on when not attached, proceed.
        }
-        Some(TenantSlot::InProgress(_)) => {
-            // Acquiring a slot guarantees its old value was not InProgress
-            unreachable!();
-        }
-        None => None,
-    };
+    }

    match tenant_cleanup
        .await
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -229,7 +229,6 @@ use crate::{
    tenant::upload_queue::{
        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
    },
-    TENANT_HEATMAP_BASENAME,
 };

 use utils::id::{TenantId, TimelineId};
@@ -819,25 +818,8 @@ impl RemoteTimelineClient {
    fn schedule_deletion_of_unlinked0(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
+        with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
    ) {
-        // Filter out any layers which were not created by this tenant shard.  These are
-        // layers that originate from some ancestor shard after a split, and may still
-        // be referenced by other shards. We are free to delete them locally and remove
-        // them from our index (and would have already done so when we reach this point
-        // in the code), but we may not delete them remotely.
-        with_metadata.retain(|(name, meta)| {
-            let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
-                && meta.shard.shard_count == self.tenant_shard_id.shard_count;
-            if !retain {
-                tracing::debug!(
-                    "Skipping deletion of ancestor-shard layer {name}, from shard {}",
-                    meta.shard
-                );
-            }
-            retain
-        });
-
        for (name, meta) in &with_metadata {
            info!(
                "scheduling deletion of layer {}{} (shard {})",
@@ -1742,11 +1724,11 @@ pub fn remote_index_path(
    .expect("Failed to construct path")
 }

+pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
+
 pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
-    RemotePath::from_string(&format!(
-        "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}"
-    ))
-    .expect("Failed to construct path")
+    RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
+        .expect("Failed to construct path")
 }

 /// Given the key of an index, parse out the generation part of the name
@@ -2210,6 +2192,15 @@ mod tests {

        let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();

+        let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
+        let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
+            timeline_path
+                .strip_prefix(&test_state.harness.conf.workdir)
+                .unwrap(),
+        );
+
+        std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
+
        let index_path = test_state.harness.remote_fs_dir.join(
            remote_index_path(
                &test_state.harness.tenant_shard_id,
@@ -2218,10 +2209,6 @@ mod tests {
            )
            .get_path(),
        );
-
-        std::fs::create_dir_all(index_path.parent().unwrap())
-            .expect("creating test dir should work");
-
        eprintln!("Writing {index_path}");
        std::fs::write(&index_path, index_part_bytes).unwrap();
        example_index_part
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -1,48 +1,24 @@
-mod downloader;
 pub mod heatmap;
 mod heatmap_uploader;
-mod scheduler;

 use std::sync::Arc;

 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};

-use self::{
-    downloader::{downloader_task, SecondaryDetail},
-    heatmap_uploader::heatmap_uploader_task,
-};
+use self::heatmap_uploader::heatmap_uploader_task;

-use super::{config::SecondaryLocationConfig, mgr::TenantManager};
+use super::mgr::TenantManager;

 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;

 use tokio_util::sync::CancellationToken;
-use utils::{completion::Barrier, sync::gate::Gate};
+use utils::completion::Barrier;

-enum DownloadCommand {
-    Download(TenantShardId),
-}
 enum UploadCommand {
    Upload(TenantShardId),
 }

-impl UploadCommand {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        match self {
-            Self::Upload(id) => id,
-        }
-    }
-}
-
-impl DownloadCommand {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        match self {
-            Self::Download(id) => id,
-        }
-    }
-}
-
 struct CommandRequest<T> {
    payload: T,
    response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
@@ -52,73 +28,12 @@ struct CommandResponse {
    result: anyhow::Result<()>,
 }

-// Whereas [`Tenant`] represents an attached tenant, this type represents the work
-// we do for secondary tenant locations: where we are not serving clients or
-// ingesting WAL, but we are maintaining a warm cache of layer files.
-//
-// This type is all about the _download_ path for secondary mode.  The upload path
-// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists.
-//
-// This structure coordinates TenantManager and SecondaryDownloader,
-// so that the downloader can indicate which tenants it is currently
-// operating on, and the manager can indicate when a particular
-// secondary tenant should cancel any work in flight.
-#[derive(Debug)]
-pub(crate) struct SecondaryTenant {
-    /// Carrying a tenant shard ID simplifies callers such as the downloader
-    /// which need to organize many of these objects by ID.
-    tenant_shard_id: TenantShardId,
-
-    /// Cancellation token indicates to SecondaryDownloader that it should stop doing
-    /// any work for this tenant at the next opportunity.
-    pub(crate) cancel: CancellationToken,
-
-    pub(crate) gate: Gate,
-
-    detail: std::sync::Mutex<SecondaryDetail>,
-}
-
-impl SecondaryTenant {
-    pub(crate) fn new(
-        tenant_shard_id: TenantShardId,
-        config: &SecondaryLocationConfig,
-    ) -> Arc<Self> {
-        Arc::new(Self {
-            tenant_shard_id,
-            // todo: shall we make this a descendent of the
-            // main cancellation token, or is it sufficient that
-            // on shutdown we walk the tenants and fire their
-            // individual cancellations?
-            cancel: CancellationToken::new(),
-            gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
-
-            detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
-        })
-    }
-
-    pub(crate) async fn shutdown(&self) {
-        self.cancel.cancel();
-
-        // Wait for any secondary downloader work to complete
-        self.gate.close().await;
-    }
-
-    pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
-        self.detail.lock().unwrap().config = config.clone();
-    }
-
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        &self.tenant_shard_id
-    }
-}
-
 /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
 /// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
 /// where we want to immediately upload/download for a particular tenant.  In normal operation
 /// uploads & downloads are autonomous and not driven by this interface.
 pub struct SecondaryController {
    upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
-    download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
 }

 impl SecondaryController {
@@ -148,13 +63,6 @@ impl SecondaryController {
        self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
            .await
    }
-    pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
-        self.dispatch(
-            &self.download_req_tx,
-            DownloadCommand::Download(tenant_shard_id),
-        )
-        .await
-    }
 }

 pub fn spawn_tasks(
@@ -163,37 +71,9 @@ pub fn spawn_tasks(
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
 ) -> SecondaryController {
-    let mgr_clone = tenant_manager.clone();
-    let storage_clone = remote_storage.clone();
-    let cancel_clone = cancel.clone();
-    let bg_jobs_clone = background_jobs_can_start.clone();
-
-    let (download_req_tx, download_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
    let (upload_req_tx, upload_req_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);

-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::SecondaryDownloads,
-        None,
-        None,
-        "secondary tenant downloads",
-        false,
-        async move {
-            downloader_task(
-                mgr_clone,
-                storage_clone,
-                download_req_rx,
-                bg_jobs_clone,
-                cancel_clone,
-            )
-            .await;
-
-            Ok(())
-        },
-    );
-
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::SecondaryUploads,
@@ -209,26 +89,16 @@ pub fn spawn_tasks(
                background_jobs_can_start,
                cancel,
            )
-            .await;
-
-            Ok(())
+            .await
        },
    );

-    SecondaryController {
-        download_req_tx,
-        upload_req_tx,
-    }
+    SecondaryController { upload_req_tx }
 }

 /// For running with remote storage disabled: a SecondaryController that is connected to nothing.
 pub fn null_controller() -> SecondaryController {
-    let (download_req_tx, _download_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
    let (upload_req_tx, _upload_req_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-    SecondaryController {
-        upload_req_tx,
-        download_req_tx,
-    }
+    SecondaryController { upload_req_tx }
 }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -1,801 +0,0 @@
-use std::{
-    collections::{HashMap, HashSet},
-    pin::Pin,
-    str::FromStr,
-    sync::Arc,
-    time::{Duration, Instant, SystemTime},
-};
-
-use crate::{
-    config::PageServerConf,
-    metrics::SECONDARY_MODE,
-    tenant::{
-        config::SecondaryLocationConfig,
-        debug_assert_current_span_has_tenant_and_timeline_id,
-        remote_timeline_client::{
-            index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
-        },
-        span::debug_assert_current_span_has_tenant_id,
-        storage_layer::LayerFileName,
-        tasks::{warn_when_period_overrun, BackgroundLoopKind},
-    },
-    virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
-    METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
-};
-
-use super::{
-    heatmap::HeatMapLayer,
-    scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs},
-    SecondaryTenant,
-};
-
-use crate::tenant::{
-    mgr::TenantManager,
-    remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
-};
-
-use chrono::format::{DelayedFormat, StrftimeItems};
-use futures::Future;
-use pageserver_api::shard::TenantShardId;
-use rand::Rng;
-use remote_storage::{DownloadError, GenericRemoteStorage};
-
-use tokio_util::sync::CancellationToken;
-use tracing::{info_span, instrument, Instrument};
-use utils::{
-    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
-};
-
-use super::{
-    heatmap::{HeatMapTenant, HeatMapTimeline},
-    CommandRequest, DownloadCommand,
-};
-
-/// For each tenant, how long must have passed since the last download_tenant call before
-/// calling it again.  This is approximately the time by which local data is allowed
-/// to fall behind remote data.
-///
-/// TODO: this should just be a default, and the actual period should be controlled
-/// via the heatmap itself
-/// `<ttps://github.com/neondatabase/neon/issues/6200>`
-const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
-
-pub(super) async fn downloader_task(
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) {
-    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
-
-    let generator = SecondaryDownloader {
-        tenant_manager,
-        remote_storage,
-    };
-    let mut scheduler = Scheduler::new(generator, concurrency);
-
-    scheduler
-        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("secondary_downloads"))
-        .await
-}
-
-struct SecondaryDownloader {
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-}
-
-#[derive(Debug, Clone)]
-pub(super) struct OnDiskState {
-    metadata: LayerFileMetadata,
-    access_time: SystemTime,
-}
-
-impl OnDiskState {
-    fn new(
-        _conf: &'static PageServerConf,
-        _tenant_shard_id: &TenantShardId,
-        _imeline_id: &TimelineId,
-        _ame: LayerFileName,
-        metadata: LayerFileMetadata,
-        access_time: SystemTime,
-    ) -> Self {
-        Self {
-            metadata,
-            access_time,
-        }
-    }
-}
-
-#[derive(Debug, Clone, Default)]
-pub(super) struct SecondaryDetailTimeline {
-    pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
-
-    /// We remember when layers were evicted, to prevent re-downloading them.
-    pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
-}
-
-/// This state is written by the secondary downloader, it is opaque
-/// to TenantManager
-#[derive(Debug)]
-pub(super) struct SecondaryDetail {
-    pub(super) config: SecondaryLocationConfig,
-
-    last_download: Option<Instant>,
-    next_download: Option<Instant>,
-    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
-}
-
-/// Helper for logging SystemTime
-fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
-    let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
-    datetime.format("%d/%m/%Y %T")
-}
-
-impl SecondaryDetail {
-    pub(super) fn new(config: SecondaryLocationConfig) -> Self {
-        Self {
-            config,
-            last_download: None,
-            next_download: None,
-            timelines: HashMap::new(),
-        }
-    }
-}
-
-struct PendingDownload {
-    secondary_state: Arc<SecondaryTenant>,
-    last_download: Option<Instant>,
-    target_time: Option<Instant>,
-    period: Option<Duration>,
-}
-
-impl scheduler::PendingJob for PendingDownload {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        self.secondary_state.get_tenant_shard_id()
-    }
-}
-
-struct RunningDownload {
-    barrier: Barrier,
-}
-
-impl scheduler::RunningJob for RunningDownload {
-    fn get_barrier(&self) -> Barrier {
-        self.barrier.clone()
-    }
-}
-
-struct CompleteDownload {
-    secondary_state: Arc<SecondaryTenant>,
-    completed_at: Instant,
-}
-
-impl scheduler::Completion for CompleteDownload {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        self.secondary_state.get_tenant_shard_id()
-    }
-}
-
-type Scheduler = TenantBackgroundJobs<
-    SecondaryDownloader,
-    PendingDownload,
-    RunningDownload,
-    CompleteDownload,
-    DownloadCommand,
->;
-
-#[async_trait::async_trait]
-impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCommand>
-    for SecondaryDownloader
-{
-    #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))]
-    fn on_completion(&mut self, completion: CompleteDownload) {
-        let CompleteDownload {
-            secondary_state,
-            completed_at: _completed_at,
-        } = completion;
-
-        tracing::debug!("Secondary tenant download completed");
-
-        // Update freshened_at even if there was an error: we don't want errored tenants to implicitly
-        // take priority to run again.
-        let mut detail = secondary_state.detail.lock().unwrap();
-        detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
-    }
-
-    async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
-        let mut result = SchedulingResult {
-            jobs: Vec::new(),
-            want_interval: None,
-        };
-
-        // Step 1: identify some tenants that we may work on
-        let mut tenants: Vec<Arc<SecondaryTenant>> = Vec::new();
-        self.tenant_manager
-            .foreach_secondary_tenants(|_id, secondary_state| {
-                tenants.push(secondary_state.clone());
-            });
-
-        // Step 2: filter out tenants which are not yet elegible to run
-        let now = Instant::now();
-        result.jobs = tenants
-            .into_iter()
-            .filter_map(|secondary_tenant| {
-                let (last_download, next_download) = {
-                    let mut detail = secondary_tenant.detail.lock().unwrap();
-
-                    if !detail.config.warm {
-                        // Downloads are disabled for this tenant
-                        detail.next_download = None;
-                        return None;
-                    }
-
-                    if detail.next_download.is_none() {
-                        // Initialize with a jitter: this spreads initial downloads on startup
-                        // or mass-attach across our freshen interval.
-                        let jittered_period =
-                            rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
-                        detail.next_download = Some(now.checked_add(jittered_period).expect(
-                        "Using our constant, which is known to be small compared with clock range",
-                    ));
-                    }
-                    (detail.last_download, detail.next_download.unwrap())
-                };
-
-                if now < next_download {
-                    Some(PendingDownload {
-                        secondary_state: secondary_tenant,
-                        last_download,
-                        target_time: Some(next_download),
-                        period: Some(DOWNLOAD_FRESHEN_INTERVAL),
-                    })
-                } else {
-                    None
-                }
-            })
-            .collect();
-
-        // Step 3: sort by target execution time to run most urgent first.
-        result.jobs.sort_by_key(|j| j.target_time);
-
-        result
-    }
-
-    fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
-        let tenant_shard_id = command.get_tenant_shard_id();
-
-        let tenant = self
-            .tenant_manager
-            .get_secondary_tenant_shard(*tenant_shard_id);
-        let Some(tenant) = tenant else {
-            {
-                return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
-            }
-        };
-
-        Ok(PendingDownload {
-            target_time: None,
-            period: None,
-            last_download: None,
-            secondary_state: tenant,
-        })
-    }
-
-    fn spawn(
-        &mut self,
-        job: PendingDownload,
-    ) -> (
-        RunningDownload,
-        Pin<Box<dyn Future<Output = CompleteDownload> + Send>>,
-    ) {
-        let PendingDownload {
-            secondary_state,
-            last_download,
-            target_time,
-            period,
-        } = job;
-
-        let (completion, barrier) = utils::completion::channel();
-        let remote_storage = self.remote_storage.clone();
-        let conf = self.tenant_manager.get_conf();
-        let tenant_shard_id = *secondary_state.get_tenant_shard_id();
-        (RunningDownload { barrier }, Box::pin(async move {
-            let _completion = completion;
-
-            match TenantDownloader::new(conf, &remote_storage, &secondary_state)
-                .download()
-                .await
-            {
-                Err(UpdateError::NoData) => {
-                    tracing::info!("No heatmap found for tenant.  This is fine if it is new.");
-                },
-                Err(UpdateError::NoSpace) => {
-                    tracing::warn!("Insufficient space while downloading.  Will retry later.");
-                }
-                Err(UpdateError::Cancelled) => {
-                    tracing::debug!("Shut down while downloading");
-                },
-                Err(UpdateError::Deserialize(e)) => {
-                    tracing::error!("Corrupt content while downloading tenant: {e}");
-                },
-                Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
-                    tracing::error!("Error while downloading tenant: {e}");
-                },
-                Ok(()) => {}
-            };
-
-            // Irrespective of the result, we will reschedule ourselves to run after our usual period.
-
-            // If the job had a target execution time, we may check our final execution
-            // time against that for observability purposes.
-            if let (Some(target_time), Some(period)) = (target_time, period) {
-                // Only track execution lag if this isn't our first download: otherwise, it is expected
-                // that execution will have taken longer than our configured interval, for example
-                // when starting up a pageserver and
-                if last_download.is_some() {
-                    // Elapsed time includes any scheduling lag as well as the execution of the job
-                    let elapsed = Instant::now().duration_since(target_time);
-
-                    warn_when_period_overrun(
-                        elapsed,
-                        period,
-                        BackgroundLoopKind::SecondaryDownload,
-                    );
-                }
-            }
-
-            CompleteDownload {
-                    secondary_state,
-                    completed_at: Instant::now(),
-                }
-        }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
-    }
-}
-
-/// This type is a convenience to group together the various functions involved in
-/// freshening a secondary tenant.
-struct TenantDownloader<'a> {
-    conf: &'static PageServerConf,
-    remote_storage: &'a GenericRemoteStorage,
-    secondary_state: &'a SecondaryTenant,
-}
-
-/// Errors that may be encountered while updating a tenant
-#[derive(thiserror::Error, Debug)]
-enum UpdateError {
-    #[error("No remote data found")]
-    NoData,
-    #[error("Insufficient local storage space")]
-    NoSpace,
-    #[error("Failed to download")]
-    DownloadError(DownloadError),
-    #[error(transparent)]
-    Deserialize(#[from] serde_json::Error),
-    #[error("Cancelled")]
-    Cancelled,
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
-impl From<DownloadError> for UpdateError {
-    fn from(value: DownloadError) -> Self {
-        match &value {
-            DownloadError::Cancelled => Self::Cancelled,
-            DownloadError::NotFound => Self::NoData,
-            _ => Self::DownloadError(value),
-        }
-    }
-}
-
-impl From<std::io::Error> for UpdateError {
-    fn from(value: std::io::Error) -> Self {
-        if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
-            UpdateError::NoSpace
-        } else {
-            // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
-            UpdateError::Other(anyhow::anyhow!(value))
-        }
-    }
-}
-
-impl<'a> TenantDownloader<'a> {
-    fn new(
-        conf: &'static PageServerConf,
-        remote_storage: &'a GenericRemoteStorage,
-        secondary_state: &'a SecondaryTenant,
-    ) -> Self {
-        Self {
-            conf,
-            remote_storage,
-            secondary_state,
-        }
-    }
-
-    async fn download(&self) -> Result<(), UpdateError> {
-        debug_assert_current_span_has_tenant_id();
-
-        // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure
-        // cover our access to local storage.
-        let Ok(_guard) = self.secondary_state.gate.enter() else {
-            // Shutting down
-            return Ok(());
-        };
-
-        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
-        // Download the tenant's heatmap
-        let heatmap_bytes = tokio::select!(
-            bytes = self.download_heatmap() => {bytes?},
-            _ = self.secondary_state.cancel.cancelled() => return Ok(())
-        );
-
-        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
-
-        // Save the heatmap: this will be useful on restart, allowing us to reconstruct
-        // layer metadata without having to re-download it.
-        let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);
-
-        let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
-        let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
-        let heatmap_path_bg = heatmap_path.clone();
-        tokio::task::spawn_blocking(move || {
-            tokio::runtime::Handle::current().block_on(async move {
-                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
-            })
-        })
-        .await
-        .expect("Blocking task is never aborted")
-        .maybe_fatal_err(&context_msg)?;
-
-        tracing::debug!("Wrote local heatmap to {}", heatmap_path);
-
-        // Download the layers in the heatmap
-        for timeline in heatmap.timelines {
-            if self.secondary_state.cancel.is_cancelled() {
-                return Ok(());
-            }
-
-            let timeline_id = timeline.timeline_id;
-            self.download_timeline(timeline)
-                .instrument(tracing::info_span!(
-                    "secondary_download_timeline",
-                    tenant_id=%tenant_shard_id.tenant_id,
-                    shard_id=%tenant_shard_id.shard_slug(),
-                    %timeline_id
-                ))
-                .await?;
-        }
-
-        Ok(())
-    }
-
-    async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
-        debug_assert_current_span_has_tenant_id();
-        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
-        // TODO: make download conditional on ETag having changed since last download
-        // (https://github.com/neondatabase/neon/issues/6199)
-        tracing::debug!("Downloading heatmap for secondary tenant",);
-
-        let heatmap_path = remote_heatmap_path(tenant_shard_id);
-
-        let heatmap_bytes = backoff::retry(
-            || async {
-                let download = self
-                    .remote_storage
-                    .download(&heatmap_path)
-                    .await
-                    .map_err(UpdateError::from)?;
-                let mut heatmap_bytes = Vec::new();
-                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
-                let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
-                Ok(heatmap_bytes)
-            },
-            |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
-            FAILED_DOWNLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "download heatmap",
-            backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
-                UpdateError::Cancelled
-            }),
-        )
-        .await?;
-
-        SECONDARY_MODE.download_heatmap.inc();
-
-        Ok(heatmap_bytes)
-    }
-
-    async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
-        let timeline_path = self
-            .conf
-            .timeline_path(tenant_shard_id, &timeline.timeline_id);
-
-        // Accumulate updates to the state
-        let mut touched = Vec::new();
-
-        // Clone a view of what layers already exist on disk
-        let timeline_state = self
-            .secondary_state
-            .detail
-            .lock()
-            .unwrap()
-            .timelines
-            .get(&timeline.timeline_id)
-            .cloned();
-
-        let timeline_state = match timeline_state {
-            Some(t) => t,
-            None => {
-                // We have no existing state: need to scan local disk for layers first.
-                let timeline_state =
-                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
-
-                // Re-acquire detail lock now that we're done with async load from local FS
-                self.secondary_state
-                    .detail
-                    .lock()
-                    .unwrap()
-                    .timelines
-                    .insert(timeline.timeline_id, timeline_state.clone());
-                timeline_state
-            }
-        };
-
-        let layers_in_heatmap = timeline
-            .layers
-            .iter()
-            .map(|l| &l.name)
-            .collect::<HashSet<_>>();
-        let layers_on_disk = timeline_state
-            .on_disk_layers
-            .iter()
-            .map(|l| l.0)
-            .collect::<HashSet<_>>();
-
-        // Remove on-disk layers that are no longer present in heatmap
-        for layer in layers_on_disk.difference(&layers_in_heatmap) {
-            let local_path = timeline_path.join(layer.to_string());
-            tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
-            tokio::fs::remove_file(&local_path)
-                .await
-                .or_else(fs_ext::ignore_not_found)
-                .maybe_fatal_err("Removing secondary layer")?;
-        }
-
-        // Download heatmap layers that are not present on local disk, or update their
-        // access time if they are already present.
-        for layer in timeline.layers {
-            if self.secondary_state.cancel.is_cancelled() {
-                return Ok(());
-            }
-
-            // Existing on-disk layers: just update their access time.
-            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
-                tracing::debug!("Layer {} is already on disk", layer.name);
-                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
-                    || on_disk.access_time != layer.access_time
-                {
-                    // We already have this layer on disk.  Update its access time.
-                    tracing::debug!(
-                        "Access time updated for layer {}: {} -> {}",
-                        layer.name,
-                        strftime(&on_disk.access_time),
-                        strftime(&layer.access_time)
-                    );
-                    touched.push(layer);
-                }
-                continue;
-            } else {
-                tracing::debug!("Layer {} not present on disk yet", layer.name);
-            }
-
-            // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
-            // recently than it was evicted.
-            if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
-                if &layer.access_time > evicted_at {
-                    tracing::info!(
-                        "Re-downloading evicted layer {}, accessed at {}, evicted at {}",
-                        layer.name,
-                        strftime(&layer.access_time),
-                        strftime(evicted_at)
-                    );
-                } else {
-                    tracing::trace!(
-                        "Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
-                        layer.name,
-                        strftime(&layer.access_time),
-                        strftime(evicted_at)
-                    );
-                    continue;
-                }
-            }
-
-            // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
-            let downloaded_bytes = match download_layer_file(
-                self.conf,
-                self.remote_storage,
-                *tenant_shard_id,
-                timeline.timeline_id,
-                &layer.name,
-                &LayerFileMetadata::from(&layer.metadata),
-                &self.secondary_state.cancel,
-            )
-            .await
-            {
-                Ok(bytes) => bytes,
-                Err(e) => {
-                    if let DownloadError::NotFound = e {
-                        // A heatmap might be out of date and refer to a layer that doesn't exist any more.
-                        // This is harmless: continue to download the next layer. It is expected during compaction
-                        // GC.
-                        tracing::debug!(
-                            "Skipped downloading missing layer {}, raced with compaction/gc?",
-                            layer.name
-                        );
-                        continue;
-                    } else {
-                        return Err(e.into());
-                    }
-                }
-            };
-
-            if downloaded_bytes != layer.metadata.file_size {
-                let local_path = timeline_path.join(layer.name.to_string());
-
-                tracing::warn!(
-                    "Downloaded layer {} with unexpected size {} != {}.  Removing download.",
-                    layer.name,
-                    downloaded_bytes,
-                    layer.metadata.file_size
-                );
-
-                tokio::fs::remove_file(&local_path)
-                    .await
-                    .or_else(fs_ext::ignore_not_found)?;
-            }
-
-            SECONDARY_MODE.download_layer.inc();
-            touched.push(layer)
-        }
-
-        // Write updates to state to record layers we just downloaded or touched.
-        {
-            let mut detail = self.secondary_state.detail.lock().unwrap();
-            let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
-
-            tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
-
-            for t in touched {
-                use std::collections::hash_map::Entry;
-                match timeline_detail.on_disk_layers.entry(t.name.clone()) {
-                    Entry::Occupied(mut v) => {
-                        v.get_mut().access_time = t.access_time;
-                    }
-                    Entry::Vacant(e) => {
-                        e.insert(OnDiskState::new(
-                            self.conf,
-                            tenant_shard_id,
-                            &timeline.timeline_id,
-                            t.name,
-                            LayerFileMetadata::from(&t.metadata),
-                            t.access_time,
-                        ));
-                    }
-                }
-            }
-        }
-
-        Ok(())
-    }
-}
-
-/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
-async fn init_timeline_state(
-    conf: &'static PageServerConf,
-    tenant_shard_id: &TenantShardId,
-    heatmap: &HeatMapTimeline,
-) -> SecondaryDetailTimeline {
-    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
-    let mut detail = SecondaryDetailTimeline::default();
-
-    let mut dir = match tokio::fs::read_dir(&timeline_path).await {
-        Ok(d) => d,
-        Err(e) => {
-            if e.kind() == std::io::ErrorKind::NotFound {
-                let context = format!("Creating timeline directory {timeline_path}");
-                tracing::info!("{}", context);
-                tokio::fs::create_dir_all(&timeline_path)
-                    .await
-                    .fatal_err(&context);
-
-                // No entries to report: drop out.
-                return detail;
-            } else {
-                on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}"));
-            }
-        }
-    };
-
-    // As we iterate through layers found on disk, we will look up their metadata from this map.
-    // Layers not present in metadata will be discarded.
-    let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
-        heatmap.layers.iter().map(|l| (&l.name, l)).collect();
-
-    while let Some(dentry) = dir
-        .next_entry()
-        .await
-        .fatal_err(&format!("Listing {timeline_path}"))
-    {
-        let dentry_file_name = dentry.file_name();
-        let file_name = dentry_file_name.to_string_lossy();
-        let local_meta = dentry.metadata().await.fatal_err(&format!(
-            "Read metadata on {}",
-            dentry.path().to_string_lossy()
-        ));
-
-        // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
-        if file_name == METADATA_FILE_NAME {
-            continue;
-        }
-
-        match LayerFileName::from_str(&file_name) {
-            Ok(name) => {
-                let remote_meta = heatmap_metadata.get(&name);
-                match remote_meta {
-                    Some(remote_meta) => {
-                        // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
-                        if local_meta.len() != remote_meta.metadata.file_size {
-                            // This should not happen, because we do crashsafe write-then-rename when downloading
-                            // layers, and layers in remote storage are immutable.  Remove the local file because
-                            // we cannot trust it.
-                            tracing::warn!(
-                                "Removing local layer {name} with unexpected local size {} != {}",
-                                local_meta.len(),
-                                remote_meta.metadata.file_size
-                            );
-                        } else {
-                            // We expect the access time to be initialized immediately afterwards, when
-                            // the latest heatmap is applied to the state.
-                            detail.on_disk_layers.insert(
-                                name.clone(),
-                                OnDiskState::new(
-                                    conf,
-                                    tenant_shard_id,
-                                    &heatmap.timeline_id,
-                                    name,
-                                    LayerFileMetadata::from(&remote_meta.metadata),
-                                    remote_meta.access_time,
-                                ),
-                            );
-                        }
-                    }
-                    None => {
-                        // FIXME: consider some optimization when transitioning from attached to secondary: maybe
-                        // wait until we have seen a heatmap that is more recent than the most recent on-disk state?  Otherwise
-                        // we will end up deleting any layers which were created+uploaded more recently than the heatmap.
-                        tracing::info!(
-                            "Removing secondary local layer {} because it's absent in heatmap",
-                            name
-                        );
-                        tokio::fs::remove_file(&dentry.path())
-                            .await
-                            .or_else(fs_ext::ignore_not_found)
-                            .fatal_err(&format!(
-                                "Removing layer {}",
-                                dentry.path().to_string_lossy()
-                            ));
-                    }
-                }
-            }
-            Err(_) => {
-                // Ignore it.
-                tracing::warn!("Unexpected file in timeline directory: {file_name}");
-            }
-        }
-    }
-
-    detail
-}
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -1,6 +1,5 @@
 use std::{
    collections::HashMap,
-    pin::Pin,
    sync::{Arc, Weak},
    time::{Duration, Instant},
 };
@@ -8,86 +7,35 @@ use std::{
 use crate::{
    metrics::SECONDARY_MODE,
    tenant::{
-        config::AttachmentMode,
-        mgr::TenantManager,
-        remote_timeline_client::remote_heatmap_path,
-        span::debug_assert_current_span_has_tenant_id,
-        tasks::{warn_when_period_overrun, BackgroundLoopKind},
-        Tenant,
+        config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
+        secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
    },
 };

-use futures::Future;
 use md5;
 use pageserver_api::shard::TenantShardId;
-use rand::Rng;
 use remote_storage::GenericRemoteStorage;

-use super::{
-    scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
-    CommandRequest,
-};
+use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
-use tracing::{info_span, instrument, Instrument};
-use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
+use tracing::instrument;
+use utils::{backoff, completion::Barrier};

-use super::{heatmap::HeatMapTenant, UploadCommand};
+use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};

-pub(super) async fn heatmap_uploader_task(
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) {
-    let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;
-
-    let generator = HeatmapUploader {
-        tenant_manager,
-        remote_storage,
-        cancel: cancel.clone(),
-        tenants: HashMap::new(),
-    };
-    let mut scheduler = Scheduler::new(generator, concurrency);
-
-    scheduler
-        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("heatmap_uploader"))
-        .await
-}
-
-/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
-/// handling loop and mutates it as needed: there are no locks here, because that event loop
-/// can hold &mut references to this type throughout.
-struct HeatmapUploader {
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    cancel: CancellationToken,
-
-    tenants: HashMap<TenantShardId, UploaderTenantState>,
-}
+/// Period between heatmap uploader walking Tenants to look for work to do.
+/// If any tenant has a heatmap upload period lower than this, the scheduling interval
+/// is adjusted downward to match, but never below `MIN_SCHEDULING_INTERVAL`.
+const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
+const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);

 struct WriteInProgress {
    barrier: Barrier,
 }

-impl RunningJob for WriteInProgress {
-    fn get_barrier(&self) -> Barrier {
-        self.barrier.clone()
-    }
-}
-
 struct UploadPending {
    tenant: Arc<Tenant>,
    last_digest: Option<md5::Digest>,
-    target_time: Option<Instant>,
-    period: Option<Duration>,
-}
-
-impl scheduler::PendingJob for UploadPending {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        self.tenant.get_tenant_shard_id()
-    }
 }

 struct WriteComplete {
@@ -97,12 +45,6 @@ struct WriteComplete {
    next_upload: Option<Instant>,
 }

-impl scheduler::Completion for WriteComplete {
-    fn get_tenant_shard_id(&self) -> &TenantShardId {
-        &self.tenant_shard_id
-    }
-}
-
 /// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
 /// when we last did a write.  We only populate this after doing at least one
 /// write for a tenant -- this avoids holding state for tenants that have
@@ -126,111 +68,267 @@ struct UploaderTenantState {
    next_upload: Option<Instant>,
 }

-type Scheduler = TenantBackgroundJobs<
-    HeatmapUploader,
-    UploadPending,
-    WriteInProgress,
-    WriteComplete,
-    UploadCommand,
->;
+/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
+/// handling loop and mutates it as needed: there are no locks here, because that event loop
+/// can hold &mut references to this type throughout.
+struct HeatmapUploader {
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    cancel: CancellationToken,

-#[async_trait::async_trait]
-impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
-    for HeatmapUploader
-{
-    async fn schedule(&mut self) -> SchedulingResult<UploadPending> {
+    tenants: HashMap<TenantShardId, UploaderTenantState>,
+
+    /// Tenants with work to do, for which tasks should be spawned as soon as concurrency
+    /// limits permit it.
+    tenants_pending: std::collections::VecDeque<UploadPending>,
+
+    /// Tenants for which a task in `tasks` has been spawned.
+    tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
+
+    tasks: JoinSet<()>,
+
+    /// Channel for our child tasks to send results to: we use a channel for results rather than
+    /// just getting task results via JoinSet because we need the channel's recv() "sleep until something
+    /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
+    /// behavior.
+    task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
+    task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
+
+    concurrent_uploads: usize,
+
+    scheduling_interval: Duration,
+}
+
+/// The uploader task runs a loop that periodically wakes up and schedules tasks for
+/// tenants that require an upload, or handles any commands that have been sent into
+/// `command_queue`.  No I/O is done in this loop: that all happens in the tasks we
+/// spawn.
+///
+/// Scheduling iterations are somewhat infrequent.  However, each one will enqueue
+/// all tenants that require an upload, and in between scheduling iterations we will
+/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
+///
+/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
+/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
+/// we might block waiting on a Tenant.
+pub(super) async fn heatmap_uploader_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
+
+    let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
+
+    let mut uploader = HeatmapUploader {
+        tenant_manager,
+        remote_storage,
+        cancel: cancel.clone(),
+        tasks: JoinSet::new(),
+        tenants: HashMap::new(),
+        tenants_pending: std::collections::VecDeque::new(),
+        tenants_uploading: HashMap::new(),
+        task_result_tx: result_tx,
+        task_result_rx: result_rx,
+        concurrent_uploads,
+        scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
+    };
+
+    tracing::info!("Waiting for background_jobs_can start...");
+    background_jobs_can_start.wait().await;
+    tracing::info!("background_jobs_can is ready, proceeding.");
+
+    while !cancel.is_cancelled() {
+        // Look for new work: this is relatively expensive because we have to go acquire the lock on
+        // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
+        // require an upload.
+        uploader.schedule_iteration().await?;
+
+        // Between scheduling iterations, we will:
+        //  - Drain any complete tasks and spawn pending tasks
+        //  - Handle incoming administrative commands
+        //  - Check our cancellation token
+        let next_scheduling_iteration = Instant::now()
+            .checked_add(uploader.scheduling_interval)
+            .unwrap_or_else(|| {
+                tracing::warn!(
+                    "Scheduling interval invalid ({}s), running immediately!",
+                    uploader.scheduling_interval.as_secs_f64()
+                );
+                Instant::now()
+            });
+        loop {
+            tokio::select! {
+                _ = cancel.cancelled() => {
+                    // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
+                    tracing::info!("Heatmap uploader joining tasks");
+                    while let Some(_r) = uploader.tasks.join_next().await {};
+                    tracing::info!("Heatmap uploader terminating");
+
+                    break;
+                },
+                _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
+                    tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
+                    break;},
+                cmd = command_queue.recv() => {
+                    tracing::debug!("heatmap_uploader_task: woke for command queue");
+                    let cmd = match cmd {
+                        Some(c) =>c,
+                        None => {
+                            // SecondaryController was destroyed, and this has raced with
+                            // our CancellationToken
+                            tracing::info!("Heatmap uploader terminating");
+                            cancel.cancel();
+                            break;
+                        }
+                    };
+
+                    let CommandRequest{
+                        response_tx,
+                        payload
+                    } = cmd;
+                    uploader.handle_command(payload, response_tx);
+                },
+                _ = uploader.process_next_completion() => {
+                    if !cancel.is_cancelled() {
+                        uploader.spawn_pending();
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+impl HeatmapUploader {
+    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
+    async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
        // Cull any entries in self.tenants whose Arc<Tenant> is gone
        self.tenants
            .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());

+        // The priority order of previously scheduled work may be invalidated by current state: drop
+        // all pending work (it will be re-scheduled if still needed)
+        self.tenants_pending.clear();
+
+        // Use a fixed 'now' through the following loop, for efficiency and fairness.
        let now = Instant::now();

-        let mut result = SchedulingResult {
-            jobs: Vec::new(),
-            want_interval: None,
-        };
+        // While iterating over the potentially-long list of tenants, we will periodically yield
+        // to avoid blocking executor.
+        const YIELD_ITERATIONS: usize = 1000;

+        // Iterate over tenants looking for work to do.
        let tenants = self.tenant_manager.get_attached_active_tenant_shards();
-
-        yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| {
-            let period = match tenant.get_heatmap_period() {
-                None => {
-                    // Heatmaps are disabled for this tenant
-                    return;
-                }
-                Some(period) => {
-                    // If any tenant has asked for uploads more frequent than our scheduling interval,
-                    // reduce it to match so that we can keep up.  This is mainly useful in testing, where
-                    // we may set rather short intervals.
-                    result.want_interval = match result.want_interval {
-                        None => Some(period),
-                        Some(existing) => Some(std::cmp::min(period, existing)),
-                    };
-
-                    period
-                }
-            };
-
-            // Stale attachments do not upload anything: if we are in this state, there is probably some
-            // other attachment in mode Single or Multi running on another pageserver, and we don't
-            // want to thrash and overwrite their heatmap uploads.
-            if tenant.get_attach_mode() == AttachmentMode::Stale {
-                return;
+        for (i, tenant) in tenants.into_iter().enumerate() {
+            // Process is shutting down, drop out
+            if self.cancel.is_cancelled() {
+                return Ok(());
            }

-            // Create an entry in self.tenants if one doesn't already exist: this will later be updated
-            // with the completion time in on_completion.
-            let state = self
-                .tenants
-                .entry(*tenant.get_tenant_shard_id())
-                .or_insert_with(|| {
-                    let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
-
-                    UploaderTenantState {
-                        tenant: Arc::downgrade(&tenant),
-                        last_upload: None,
-                        next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
-                        last_digest: None,
-                    }
-                });
-
-            // Decline to do the upload if insufficient time has passed
-            if state.next_upload.map(|nu| nu > now).unwrap_or(false) {
-                return;
+            // Skip tenants that already have a write in flight
+            if self
+                .tenants_uploading
+                .contains_key(tenant.get_tenant_shard_id())
+            {
+                continue;
            }

-            let last_digest = state.last_digest;
-            result.jobs.push(UploadPending {
-                tenant,
-                last_digest,
-                target_time: state.next_upload,
-                period: Some(period),
-            });
-        })
-        .await
-        .ok();
+            self.maybe_schedule_upload(&now, tenant);

-        result
+            if (i + 1) % YIELD_ITERATIONS == 0 {
+                tokio::task::yield_now().await;
+            }
+        }
+
+        // Spawn tasks for as many of our pending tenants as we can.
+        self.spawn_pending();
+
+        Ok(())
    }

-    fn spawn(
-        &mut self,
-        job: UploadPending,
-    ) -> (
-        WriteInProgress,
-        Pin<Box<dyn Future<Output = WriteComplete> + Send>>,
-    ) {
-        let UploadPending {
+    /// Wait for the next result from a spawned upload task and record it via
+    /// [`Self::on_completion`].
+    ///
+    /// Cancellation: this method is cancel-safe.
+    async fn process_next_completion(&mut self) {
+        // `recv()` only yields `None` once every sender is dropped, and we hold a
+        // sender in `self.task_result_tx` for as long as `self` exists.
+        let Some(completed) = self.task_result_rx.recv().await else {
+            unreachable!("Result sender is stored on Self");
+        };
+        self.on_completion(completed);
+    }
+
+    /// The 'maybe' refers to the tenant's state: whether it is configured
+    /// for heatmap uploads at all, and whether sufficient time has passed
+    /// since the last upload.
+    fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
+        match tenant.get_heatmap_period() {
+            None => {
+                // Heatmaps are disabled for this tenant
+                return;
+            }
+            Some(period) => {
+                // If any tenant has asked for uploads more frequent than our scheduling interval,
+                // reduce it to match so that we can keep up.  This is mainly useful in testing, where
+                // we may set rather short intervals.
+                if period < self.scheduling_interval {
+                    self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
+                }
+            }
+        }
+
+        // Stale attachments do not upload anything: if we are in this state, there is probably some
+        // other attachment in mode Single or Multi running on another pageserver, and we don't
+        // want to thrash and overwrite their heatmap uploads.
+        if tenant.get_attach_mode() == AttachmentMode::Stale {
+            return;
+        }
+
+        // Create an entry in self.tenants if one doesn't already exist: this will later be updated
+        // with the completion time in on_completion.
+        let state = self
+            .tenants
+            .entry(*tenant.get_tenant_shard_id())
+            .or_insert_with(|| UploaderTenantState {
+                tenant: Arc::downgrade(&tenant),
+                last_upload: None,
+                next_upload: Some(Instant::now()),
+                last_digest: None,
+            });
+
+        // Decline to do the upload if insufficient time has passed
+        if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
+            return;
+        }
+
+        let last_digest = state.last_digest;
+        self.tenants_pending.push_back(UploadPending {
            tenant,
            last_digest,
-            target_time,
-            period,
-        } = job;
+        })
+    }

+    /// Start uploads for queued tenants in FIFO order while `tenants_uploading`
+    /// holds fewer than `concurrent_uploads` entries.
+    fn spawn_pending(&mut self) {
+        while self.tenants_uploading.len() < self.concurrent_uploads {
+            // Queue drained: nothing further to start in this pass.
+            let Some(next) = self.tenants_pending.pop_front() else { break };
+            self.spawn_upload(next.tenant, next.last_digest);
+        }
+    }
+
+    fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
        let remote_storage = self.remote_storage.clone();
-        let (completion, barrier) = utils::completion::channel();
        let tenant_shard_id = *tenant.get_tenant_shard_id();
-        (WriteInProgress { barrier }, Box::pin(async move {
+        let (completion, barrier) = utils::completion::channel();
+        let result_tx = self.task_result_tx.clone();
+        self.tasks.spawn(async move {
            // Guard for the barrier in [`WriteInProgress`]
            let _completion = completion;

@@ -264,47 +362,22 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            };

            let now = Instant::now();
-
-            // If the job had a target execution time, we may check our final execution
-            // time against that for observability purposes.
-            if let (Some(target_time), Some(period)) = (target_time, period) {
-                // Elapsed time includes any scheduling lag as well as the execution of the job
-                let elapsed = now.duration_since(target_time);
-
-                warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload);
-            }
-
            let next_upload = tenant
                .get_heatmap_period()
                .and_then(|period| now.checked_add(period));

-            WriteComplete {
+            result_tx
+                .send(WriteComplete {
                    tenant_shard_id: *tenant.get_tenant_shard_id(),
                    completed_at: now,
                    digest,
                    next_upload,
-                }
-        }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
-    }
+                })
+                .ok();
+        });

-    fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
-        let tenant_shard_id = command.get_tenant_shard_id();
-
-        tracing::info!(
-            tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-            "Starting heatmap write on command");
-        let tenant = self
-            .tenant_manager
-            .get_attached_tenant_shard(*tenant_shard_id, true)
-            .map_err(|e| anyhow::anyhow!(e))?;
-
-        Ok(UploadPending {
-            // Ignore our state for last digest: this forces an upload even if nothing has changed
-            last_digest: None,
-            tenant,
-            target_time: None,
-            period: None,
-        })
+        self.tenants_uploading
+            .insert(tenant_shard_id, WriteInProgress { barrier });
    }

    #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
@@ -316,6 +389,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            digest,
            next_upload,
        } = completion;
+        self.tenants_uploading.remove(&tenant_shard_id);
        use std::collections::hash_map::Entry;
        match self.tenants.entry(tenant_shard_id) {
            Entry::Vacant(_) => {
@@ -328,6 +402,69 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            }
        }
    }
+
+    fn handle_command(
+        &mut self,
+        command: UploadCommand,
+        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+    ) {
+        match command {
+            UploadCommand::Upload(tenant_shard_id) => {
+                // If an upload was ongoing for this tenant, let it finish first.
+                let barrier = if let Some(writing_state) =
+                    self.tenants_uploading.get(&tenant_shard_id)
+                {
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Waiting for heatmap write to complete");
+                    writing_state.barrier.clone()
+                } else {
+                    // Spawn the upload right away, bypassing the pending queue and the concurrency limit.  This
+                    // call does not block: the wait for completion happens in the task spawned below.
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Starting heatmap write on command");
+                    let tenant = match self
+                        .tenant_manager
+                        .get_attached_tenant_shard(tenant_shard_id, true)
+                    {
+                        Ok(t) => t,
+                        Err(e) => {
+                            // Drop result of send: we don't care if caller dropped their receiver
+                            drop(response_tx.send(CommandResponse {
+                                result: Err(e.into()),
+                            }));
+                            return;
+                        }
+                    };
+                    self.spawn_upload(tenant, None);
+                    let writing_state = self
+                        .tenants_uploading
+                        .get(&tenant_shard_id)
+                        .expect("We just inserted this");
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Waiting for heatmap upload to complete");
+
+                    writing_state.barrier.clone()
+                };
+
+                // This task does no I/O: it only listens for a barrier's completion and then
+                // sends to the command response channel.  It is therefore safe to spawn this without
+                // any gates/task_mgr hooks.
+                tokio::task::spawn(async move {
+                    barrier.wait().await;
+
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Heatmap upload complete");
+
+                    // Drop result of send: we don't care if caller dropped their receiver
+                    drop(response_tx.send(CommandResponse { result: Ok(()) }))
+                });
+            }
+        }
+    }
 }

 enum UploadHeatmapOutcome {
@@ -350,6 +487,7 @@ enum UploadHeatmapError {

 /// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
 /// of the object we would have uploaded.
+#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
 async fn upload_tenant_heatmap(
    remote_storage: GenericRemoteStorage,
    tenant: &Arc<Tenant>,
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -1,361 +0,0 @@
-use async_trait;
-use futures::Future;
-use std::{
-    collections::HashMap,
-    marker::PhantomData,
-    pin::Pin,
-    time::{Duration, Instant},
-};
-
-use pageserver_api::shard::TenantShardId;
-use tokio::task::JoinSet;
-use tokio_util::sync::CancellationToken;
-use utils::{completion::Barrier, yielding_loop::yielding_loop};
-
-use super::{CommandRequest, CommandResponse};
-
-/// Scheduling interval is the time between calls to JobGenerator::schedule.
-/// When we schedule jobs, the job generator may provide a hint of its preferred
-/// interval, which we will respect within these intervals.
-const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
-const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
-
-/// Scheduling helper for background work across many tenants.
-///
-/// Systems that need to run background work across many tenants may use this type
-/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`]
-/// implementation to provide the work to execute.  This is a simple scheduler that just
-/// polls the generator for outstanding work, replacing its queue of pending work with
-/// what the generator yields on each call: the job generator can change its mind about
-/// the order of jobs between calls.  The job generator is notified when jobs complete,
-/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement
-/// admin APIs).
-///
-/// For an example see [`crate::tenant::secondary::heatmap_uploader`]
-///
-/// G: A JobGenerator that this scheduler will poll to find pending jobs
-/// PJ: 'Pending Job': type for job descriptors that are ready to run
-/// RJ: 'Running Job' type' for jobs that have been spawned
-/// C : 'Completion' type that spawned jobs will send when they finish
-/// CMD: 'Command' type that the job generator will accept to create jobs on-demand
-pub(super) struct TenantBackgroundJobs<G, PJ, RJ, C, CMD>
-where
-    G: JobGenerator<PJ, RJ, C, CMD>,
-    C: Completion,
-    PJ: PendingJob,
-    RJ: RunningJob,
-{
-    generator: G,
-
-    /// Ready to run.  Will progress to `running` once concurrent limit is satisfied, or
-    /// be removed on next scheduling pass.
-    pending: std::collections::VecDeque<PJ>,
-
-    /// Tasks currently running in Self::tasks for these tenants.  Check this map
-    /// before pushing more work into pending for the same tenant.
-    running: HashMap<TenantShardId, RJ>,
-
-    tasks: JoinSet<C>,
-
-    concurrency: usize,
-
-    /// How often we would like schedule_interval to be called.
-    pub(super) scheduling_interval: Duration,
-
-    _phantom: PhantomData<(PJ, RJ, C, CMD)>,
-}
-
-#[async_trait::async_trait]
-pub(crate) trait JobGenerator<PJ, RJ, C, CMD>
-where
-    C: Completion,
-    PJ: PendingJob,
-    RJ: RunningJob,
-{
-    /// Called at each scheduling interval.  Return a list of jobs to run, most urgent first.
-    ///
-    /// This function may be expensive (e.g. walk all tenants), but should not do any I/O.
-    /// Implementations should take care to yield the executor periodically if running
-    /// very long loops.
-    ///
-    /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending
-    /// jobs is not drained by the next scheduling interval, pending jobs will be cleared
-    /// and re-generated.
-    async fn schedule(&mut self) -> SchedulingResult<PJ>;
-
-    /// Called when a pending job is ready to be run.
-    ///
-    /// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it.
-    fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin<Box<dyn Future<Output = C> + Send>>);
-
-    /// Called when a job previously spawned with spawn() transmits its completion
-    fn on_completion(&mut self, completion: C);
-
-    /// Called when a command is received.  A job will be spawned immediately if the return
-    /// value is Some, ignoring concurrency limits and the pending queue.
-    fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
-}
-
-/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
-pub(super) struct SchedulingResult<PJ> {
-    pub(super) jobs: Vec<PJ>,
-    /// The job generator would like to be called again this soon
-    pub(super) want_interval: Option<Duration>,
-}
-
-/// See [`TenantBackgroundJobs`].
-pub(super) trait PendingJob {
-    fn get_tenant_shard_id(&self) -> &TenantShardId;
-}
-
-/// See [`TenantBackgroundJobs`].
-pub(super) trait Completion: Send + 'static {
-    fn get_tenant_shard_id(&self) -> &TenantShardId;
-}
-
-/// See [`TenantBackgroundJobs`].
-pub(super) trait RunningJob {
-    fn get_barrier(&self) -> Barrier;
-}
-
-impl<G, PJ, RJ, C, CMD> TenantBackgroundJobs<G, PJ, RJ, C, CMD>
-where
-    C: Completion,
-    PJ: PendingJob,
-    RJ: RunningJob,
-    G: JobGenerator<PJ, RJ, C, CMD>,
-{
-    pub(super) fn new(generator: G, concurrency: usize) -> Self {
-        Self {
-            generator,
-            pending: std::collections::VecDeque::new(),
-            running: HashMap::new(),
-            tasks: JoinSet::new(),
-            concurrency,
-            scheduling_interval: MAX_SCHEDULING_INTERVAL,
-            _phantom: PhantomData,
-        }
-    }
-
-    pub(super) async fn run(
-        &mut self,
-        mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<CMD>>,
-        background_jobs_can_start: Barrier,
-        cancel: CancellationToken,
-    ) {
-        tracing::info!("Waiting for background_jobs_can start...");
-        background_jobs_can_start.wait().await;
-        tracing::info!("background_jobs_can is ready, proceeding.");
-
-        while !cancel.is_cancelled() {
-            // Look for new work: this is relatively expensive because we have to go acquire the lock on
-            // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
-            // require an upload.
-            self.schedule_iteration(&cancel).await;
-
-            if cancel.is_cancelled() {
-                return;
-            }
-
-            // Schedule some work, if concurrency limit permits it
-            self.spawn_pending();
-
-            // Between scheduling iterations, we will:
-            //  - Drain any complete tasks and spawn pending tasks
-            //  - Handle incoming administrative commands
-            //  - Check our cancellation token
-            let next_scheduling_iteration = Instant::now()
-                .checked_add(self.scheduling_interval)
-                .unwrap_or_else(|| {
-                    tracing::warn!(
-                        "Scheduling interval invalid ({}s)",
-                        self.scheduling_interval.as_secs_f64()
-                    );
-                    // unwrap(): this constant is small, cannot fail to add to time unless
-                    // we are close to the end of the universe.
-                    Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap()
-                });
-            loop {
-                tokio::select! {
-                    _ = cancel.cancelled() => {
-                        tracing::info!("joining tasks");
-                        // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
-                        // It is the callers responsibility to make sure that the tasks they scheduled
-                        // respect an appropriate cancellation token, to shut down promptly.  It is only
-                        // safe to wait on joining these tasks because we can see the cancellation token
-                        // has been set.
-                        while let Some(_r) = self.tasks.join_next().await {}
-                        tracing::info!("terminating on cancellation token.");
-
-                        break;
-                    },
-                    _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
-                        tracing::debug!("woke for scheduling interval");
-                        break;},
-                    cmd = command_queue.recv() => {
-                        tracing::debug!("woke for command queue");
-                        let cmd = match cmd {
-                            Some(c) =>c,
-                            None => {
-                                // SecondaryController was destroyed, and this has raced with
-                                // our CancellationToken
-                                tracing::info!("terminating on command queue destruction");
-                                cancel.cancel();
-                                break;
-                            }
-                        };
-
-                        let CommandRequest{
-                            response_tx,
-                            payload
-                        } = cmd;
-                        self.handle_command(payload, response_tx);
-                    },
-                    _ = async {
-                        let completion = self.process_next_completion().await;
-                        match completion {
-                            Some(c) => {
-                                self.generator.on_completion(c);
-                                if !cancel.is_cancelled() {
-                                    self.spawn_pending();
-                                }
-                            },
-                            None => {
-                                // Nothing is running, so just wait: expect that this future
-                                // will be dropped when something in the outer select! fires.
-                                cancel.cancelled().await;
-                            }
-                        }
-
-                     } => {}
-                }
-            }
-        }
-    }
-
-    fn do_spawn(&mut self, job: PJ) {
-        let tenant_shard_id = *job.get_tenant_shard_id();
-        let (in_progress, fut) = self.generator.spawn(job);
-
-        self.tasks.spawn(fut);
-
-        self.running.insert(tenant_shard_id, in_progress);
-    }
-
-    /// For all pending tenants that are elegible for execution, spawn their task.
-    ///
-    /// Caller provides the spawn operation, we track the resulting execution.
-    fn spawn_pending(&mut self) {
-        while !self.pending.is_empty() && self.running.len() < self.concurrency {
-            // unwrap: loop condition includes !is_empty()
-            let pending = self.pending.pop_front().unwrap();
-            self.do_spawn(pending);
-        }
-    }
-
-    /// For administrative commands: skip the pending queue, ignore concurrency limits
-    fn spawn_now(&mut self, job: PJ) -> &RJ {
-        let tenant_shard_id = *job.get_tenant_shard_id();
-        self.do_spawn(job);
-        self.running
-            .get(&tenant_shard_id)
-            .expect("We just inserted this")
-    }
-
-    /// Wait until the next task completes, and handle its completion
-    ///
-    /// Cancellation: this method is cancel-safe.
-    async fn process_next_completion(&mut self) -> Option<C> {
-        match self.tasks.join_next().await {
-            Some(r) => {
-                // We use a channel to drive completions, but also
-                // need to drain the JoinSet to avoid completed tasks
-                // accumulating.  These calls are 1:1 because every task
-                // we spawn into this joinset submits is result to the channel.
-                let completion = r.expect("Panic in background task");
-
-                self.running.remove(completion.get_tenant_shard_id());
-                Some(completion)
-            }
-            None => {
-                // Nothing is running, so we have nothing to wait for.  We may drop out: the
-                // main even loop will call us again after the next time it has run something.
-                None
-            }
-        }
-    }
-
-    /// Convert the command into a pending job, spawn it, and when the spawned
-    /// job completes, send the result down `response_tx`.
-    fn handle_command(
-        &mut self,
-        cmd: CMD,
-        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
-    ) {
-        let job = match self.generator.on_command(cmd) {
-            Ok(j) => j,
-            Err(e) => {
-                response_tx.send(CommandResponse { result: Err(e) }).ok();
-                return;
-            }
-        };
-
-        let tenant_shard_id = job.get_tenant_shard_id();
-        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            barrier
-        } else {
-            let running = self.spawn_now(job);
-            running.get_barrier().clone()
-        };
-
-        // This task does no I/O: it only listens for a barrier's completion and then
-        // sends to the command response channel.  It is therefore safe to spawn this without
-        // any gates/task_mgr hooks.
-        tokio::task::spawn(async move {
-            barrier.wait().await;
-
-            response_tx.send(CommandResponse { result: Ok(()) }).ok();
-        });
-    }
-
-    fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option<Barrier> {
-        self.running.get(tenant_shard_id).map(|r| r.get_barrier())
-    }
-
-    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
-    ///
-    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
-    ///
-    /// This function resets the pending list: it is assumed that the caller may change their mind about
-    /// which tenants need work between calls to schedule_iteration.
-    async fn schedule_iteration(&mut self, cancel: &CancellationToken) {
-        let SchedulingResult {
-            jobs,
-            want_interval,
-        } = self.generator.schedule().await;
-
-        // Adjust interval based on feedback from the job generator
-        if let Some(want_interval) = want_interval {
-            // Calculation uses second granularity: this scheduler is not intended for high frequency tasks
-            self.scheduling_interval = Duration::from_secs(std::cmp::min(
-                std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()),
-                MAX_SCHEDULING_INTERVAL.as_secs(),
-            ));
-        }
-
-        // The priority order of previously scheduled work may be invalidated by current state: drop
-        // all pending work (it will be re-scheduled if still needed)
-        self.pending.clear();
-
-        // While iterating over the potentially-long list of tenants, we will periodically yield
-        // to avoid blocking executor.
-        yielding_loop(1000, cancel, jobs.into_iter(), |job| {
-            // Skip tenants that already have a write in flight
-            if !self.running.contains_key(job.get_tenant_shard_id()) {
-                self.pending.push_back(job);
-            }
-        })
-        .await
-        .ok();
-    }
-}
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -770,7 +770,7 @@ impl DeltaLayerInner {
            .build();

        // Ok, 'offsets' now contains the offsets of all the entries we need to read
-        let cursor = file.block_cursor();
+        let cursor = file.block_cursor_direct();
        let mut buf = Vec::new();
        for (entry_lsn, pos) in offsets {
            cursor
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -427,7 +427,7 @@ impl ImageLayerInner {
            .await?
        {
            let blob = file
-                .block_cursor()
+                .block_cursor_direct()
                .read_blob(
                    offset,
                    &RequestContextBuilder::extend(ctx)
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use tokio::sync::{RwLock, RwLockWriteGuard};
+use tokio::sync::RwLock;

 use super::{DeltaLayerWriter, ResidentLayer};

@@ -246,43 +246,16 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub(crate) async fn put_value(
+    pub async fn put_value(
        &self,
        key: Key,
        lsn: Lsn,
        val: &Value,
        ctx: &RequestContext,
-    ) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
-    }
-
-    pub(crate) async fn put_values(
-        &self,
-        values: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
-        for (key, vals) in values {
-            for (lsn, val) in vals {
-                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
-                    .await?;
-            }
-        }
-        Ok(())
-    }
-
-    async fn put_value_locked(
-        &self,
-        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
-        key: Key,
-        lsn: Lsn,
-        val: &Value,
-        ctx: &RequestContext,
    ) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
+        let inner: &mut _ = &mut *self.inner.write().await;
+        self.assert_writable();

        let off = {
            // Avoid doing allocations for "small" values.
@@ -291,7 +264,7 @@ impl InMemoryLayer {
            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
            buf.clear();
            val.ser_into(&mut buf)?;
-            locked_inner
+            inner
                .file
                .write_blob(
                    &buf,
@@ -302,7 +275,7 @@ impl InMemoryLayer {
                .await?
        };

-        let vec_map = locked_inner.index.entry(key).or_default();
+        let vec_map = inner.index.entry(key).or_default();
        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
@@ -312,11 +285,13 @@ impl InMemoryLayer {
        Ok(())
    }

-    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
+    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys
+
        Ok(())
    }

+    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    pub async fn freeze(&self, end_lsn: Lsn) {
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -878,23 +878,6 @@ impl LayerInner {
                        Ok(())
                    }
                    Err(e) => {
-                        let consecutive_failures =
-                            this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
-
-                        let backoff = utils::backoff::exponential_backoff_duration_seconds(
-                            consecutive_failures.min(u32::MAX as usize) as u32,
-                            1.5,
-                            60.0,
-                        );
-
-                        let backoff = std::time::Duration::from_secs_f64(backoff);
-
-                        tokio::select! {
-                            _ = tokio::time::sleep(backoff) => {},
-                            _ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
-                            _ = timeline.cancel.cancelled() => {},
-                        };
-
                        Err(e)
                    }
                };
@@ -943,9 +926,21 @@ impl LayerInner {
                Ok(permit)
            }
            Ok((Err(e), _permit)) => {
-                // sleep already happened in the spawned task, if it was not cancelled
-                let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
+                // FIXME: this backoff should live in the spawned task and be cancellation sensitive
+                //
+                // While we should not need this, the backoff has turned out to be useful with
+                // a bug where a remote layer file was unexpectedly deleted (#5787).
+                let consecutive_failures =
+                    self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
+                let backoff = utils::backoff::exponential_backoff_duration_seconds(
+                    consecutive_failures.min(u32::MAX as usize) as u32,
+                    1.5,
+                    60.0,
+                );
+                let backoff = std::time::Duration::from_secs_f64(backoff);
+
+                tokio::time::sleep(backoff).await;
                Err(DownloadError::DownloadFailed)
            }
            Err(_gone) => Err(DownloadError::DownloadCancelled),
@@ -1118,7 +1113,6 @@ impl LayerInner {
                        tracing::info!("evicted layer after unknown residence period");
                    }
                }
-                timeline.metrics.evictions.inc();
                timeline
                    .metrics
                    .resident_physical_size_sub(self.desc.file_size);
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -45,8 +45,6 @@ pub(crate) enum BackgroundLoopKind {
    ConsumptionMetricsCollectMetrics,
    ConsumptionMetricsSyntheticSizeWorker,
    InitialLogicalSizeCalculation,
-    HeatmapUpload,
-    SecondaryDownload,
 }

 impl BackgroundLoopKind {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,6 +14,7 @@ use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::{
+    key::is_rel_block_key,
    models::{
        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo,
        TimelineState,
@@ -373,20 +374,15 @@ pub struct GcInfo {
 }

 /// An error happened in a get() operation.
-#[derive(thiserror::Error, Debug)]
-pub(crate) enum PageReconstructError {
+#[derive(thiserror::Error)]
+pub enum PageReconstructError {
    #[error(transparent)]
    Other(#[from] anyhow::Error),

-    #[error("Ancestor LSN wait error: {0}")]
-    AncestorLsnTimeout(#[from] WaitLsnError),
-
    /// The operation was cancelled
-    #[error("Cancelled")]
    Cancelled,

    /// The ancestor of this is being stopped
-    #[error("ancestor timeline {0} is being stopped")]
    AncestorStopping(TimelineId),

    /// An error happened replaying WAL records
@@ -407,6 +403,32 @@ enum FlushLayerError {
    Other(#[from] anyhow::Error),
 }

+impl std::fmt::Debug for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
+            Self::WalRedo(err) => err.fmt(f),
+        }
+    }
+}
+
+impl std::fmt::Display for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
+            Self::WalRedo(err) => err.fmt(f),
+        }
+    }
+}
+
 #[derive(Clone, Copy)]
 pub enum LogicalSizeCalculationCause {
    Initial,
@@ -431,21 +453,6 @@ impl std::fmt::Debug for Timeline {
    }
 }

-#[derive(thiserror::Error, Debug)]
-pub(crate) enum WaitLsnError {
-    // Called on a timeline which is shutting down
-    #[error("Shutdown")]
-    Shutdown,
-
-    // Called on an timeline not in active state or shutting down
-    #[error("Bad state (not active)")]
-    BadState,
-
-    // Timeout expired while waiting for LSN to catch up with goal.
-    #[error("{0}")]
-    Timeout(String),
-}
-
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -480,7 +487,7 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub(crate) async fn get(
+    pub async fn get(
        &self,
        key: Key,
        lsn: Lsn,
@@ -490,11 +497,6 @@ impl Timeline {
            return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
        }

-        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
-        // already checked the key against the shard_identity when looking up the Timeline from
-        // page_service.
-        debug_assert!(!self.shard_identity.is_key_disposable(&key));
-
        // XXX: structured stats collection for layer eviction here.
        trace!(
            "get page request for {}@{} from task kind {:?}",
@@ -507,21 +509,25 @@ impl Timeline {
        // The cached image can be returned directly if there is no WAL between the cached image
        // and requested LSN. The cached image can also be used to reduce the amount of WAL needed
        // for redo.
-        let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await {
-            Some((cached_lsn, cached_img)) => {
-                match cached_lsn.cmp(&lsn) {
-                    Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
-                    Ordering::Equal => {
-                        MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc();
-                        return Ok(cached_img); // exact LSN match, return the image
-                    }
-                    Ordering::Greater => {
-                        unreachable!("the returned lsn should never be after the requested lsn")
+        let cached_page_img = if is_rel_block_key(&key) && key.field6 != 0xffffffff {
+            None
+        } else {
+            match self.lookup_cached_page(&key, lsn, ctx).await {
+                Some((cached_lsn, cached_img)) => {
+                    match cached_lsn.cmp(&lsn) {
+                        Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
+                        Ordering::Equal => {
+                            MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc();
+                            return Ok(cached_img); // exact LSN match, return the image
+                        }
+                        Ordering::Greater => {
+                            unreachable!("the returned lsn should never be after the requested lsn")
+                        }
                    }
+                    Some((cached_lsn, cached_img))
                }
-                Some((cached_lsn, cached_img))
+                None => None,
            }
-            None => None,
        };

        let mut reconstruct_state = ValueReconstructState {
@@ -628,28 +634,24 @@ impl Timeline {
    /// You should call this before any of the other get_* or list_* functions. Calling
    /// those functions with an LSN that has been processed yet is an error.
    ///
-    pub(crate) async fn wait_lsn(
+    pub async fn wait_lsn(
        &self,
        lsn: Lsn,
        _ctx: &RequestContext, /* Prepare for use by cancellation */
-    ) -> Result<(), WaitLsnError> {
-        if self.cancel.is_cancelled() {
-            return Err(WaitLsnError::Shutdown);
-        } else if !self.is_active() {
-            return Err(WaitLsnError::BadState);
-        }
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");

        // This should never be called from the WAL receiver, because that could lead
        // to a deadlock.
-        debug_assert!(
+        anyhow::ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
            "wait_lsn cannot be called in WAL receiver"
        );
-        debug_assert!(
+        anyhow::ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
            "wait_lsn cannot be called in WAL receiver"
        );
-        debug_assert!(
+        anyhow::ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
            "wait_lsn cannot be called in WAL receiver"
        );
@@ -663,22 +665,18 @@ impl Timeline {
        {
            Ok(()) => Ok(()),
            Err(e) => {
-                use utils::seqwait::SeqWaitError::*;
-                match e {
-                    Shutdown => Err(WaitLsnError::Shutdown),
-                    Timeout => {
-                        // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
-                        drop(_timer);
-                        let walreceiver_status = self.walreceiver_status();
-                        Err(WaitLsnError::Timeout(format!(
+                // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
+                drop(_timer);
+                let walreceiver_status = self.walreceiver_status();
+                Err(anyhow::Error::new(e).context({
+                    format!(
                        "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
                        lsn,
                        self.get_last_record_lsn(),
                        self.get_disk_consistent_lsn(),
                        walreceiver_status,
-                    )))
-                    }
-                }
+                    )
+                }))
            }
        }
    }
@@ -1466,7 +1464,6 @@ impl Timeline {
                max_lsn_wal_lag,
                auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
                availability_zone: self.conf.availability_zone.clone(),
-                ingest_batch_size: self.conf.ingest_batch_size,
            },
            broker_client,
            ctx,
@@ -2231,13 +2228,13 @@ impl Timeline {
                    return Err(layer_traversal_error(
                        if cfg!(test) {
                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
+                                "could not find data for key {} at LSN {}, for request at LSN {}\n{}",
+                                key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
                            )
                        } else {
                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
+                                "could not find data for key {} at LSN {}, for request at LSN {}",
+                                key, cont_lsn, request_lsn
                            )
                        },
                        traversal_path,
@@ -2297,12 +2294,11 @@ impl Timeline {
                ancestor
                    .wait_lsn(timeline.ancestor_lsn, ctx)
                    .await
-                    .map_err(|e| match e {
-                        e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
-                        WaitLsnError::Shutdown => PageReconstructError::Cancelled,
-                        e @ WaitLsnError::BadState => {
-                            PageReconstructError::Other(anyhow::anyhow!(e))
-                        }
+                    .with_context(|| {
+                        format!(
+                            "wait for lsn {} on ancestor timeline_id={}",
+                            timeline.ancestor_lsn, ancestor.timeline_id
+                        )
                    })?;

                timeline_owned = ancestor;
@@ -2480,27 +2476,9 @@ impl Timeline {
        Ok(())
    }

-    async fn put_values(
-        &self,
-        values: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // Pick the first LSN in the batch to get the layer to write to.
-        for lsns in values.values() {
-            if let Some((lsn, _)) = lsns.first() {
-                let layer = self.get_layer_for_write(*lsn).await?;
-                layer.put_values(values, ctx).await?;
-                break;
-            }
-        }
-        Ok(())
-    }
-
-    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        if let Some((_, lsn)) = tombstones.first() {
-            let layer = self.get_layer_for_write(*lsn).await?;
-            layer.put_tombstones(tombstones).await?;
-        }
+    async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
+        let layer = self.get_layer_for_write(lsn).await?;
+        layer.put_tombstone(key_range, lsn).await?;
        Ok(())
    }

@@ -3062,15 +3040,6 @@ impl Timeline {
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
-                        if self.shard_identity.is_key_disposable(&key) {
-                            debug!(
-                                "Dropping key {} during compaction (it belongs on shard {:?})",
-                                key,
-                                self.shard_identity.get_shard_number(&key)
-                            );
-                            key = key.next();
-                            continue;
-                        }
                        let img = match self.get(key, lsn, ctx).await {
                            Ok(img) => img,
                            Err(err) => {
@@ -3097,7 +3066,6 @@ impl Timeline {
                                }
                            }
                        };
-
                        image_layer_writer.put_image(key, &img).await?;
                        key = key.next();
                    }
@@ -3668,15 +3636,7 @@ impl Timeline {
                )))
            });

-            if !self.shard_identity.is_key_disposable(&key) {
-                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
-            } else {
-                debug!(
-                    "Dropping key {} during compaction (it belongs on shard {:?})",
-                    key,
-                    self.shard_identity.get_shard_number(&key)
-                );
-            }
+            writer.as_mut().unwrap().put_value(key, lsn, value).await?;

            if !new_layers.is_empty() {
                fail_point!("after-timeline-compacted-first-L1");
@@ -4231,10 +4191,12 @@ impl Timeline {
                    .context("Failed to reconstruct a page image:")
                {
                    Ok(img) => img,
-                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
+                    Err(e) => return Err(PageReconstructError::from(e)),
                };

-                if img.len() == page_cache::PAGE_SZ {
+                if img.len() == page_cache::PAGE_SZ
+                    && !(is_rel_block_key(&key) && key.field6 != 0xffffffff)
+                {
                    let cache = page_cache::get();
                    if let Err(e) = cache
                        .memorize_materialized_page(
@@ -4574,16 +4536,8 @@ impl<'a> TimelineWriter<'a> {
        self.tl.put_value(key, lsn, value, ctx).await
    }

-    pub(crate) async fn put_batch(
-        &self,
-        batch: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        self.tl.put_values(batch, ctx).await
-    }
-
-    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        self.tl.put_tombstones(batch).await
+    pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
+        self.tl.put_tombstone(key_range, lsn).await
    }

    /// Track the end of the latest digested WAL record.
@@ -4594,11 +4548,11 @@ impl<'a> TimelineWriter<'a> {
    /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
    /// the 'lsn' or anything older. The previous last record LSN is stored alongside
    /// the latest and can be read.
-    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
+    pub fn finish_write(&self, new_lsn: Lsn) {
        self.tl.finish_write(new_lsn);
    }

-    pub(crate) fn update_current_logical_size(&self, delta: i64) {
+    pub fn update_current_logical_size(&self, delta: i64) {
        self.tl.update_current_logical_size(delta)
    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -58,7 +58,6 @@ pub struct WalReceiverConf {
    pub max_lsn_wal_lag: NonZeroU64,
    pub auth_token: Option<Arc<String>>,
    pub availability_zone: Option<String>,
-    pub ingest_batch_size: u64,
 }

 pub struct WalReceiver {
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -138,7 +138,7 @@ pub(super) async fn connection_manager_loop_step(
                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
                    Err(status) => {
                        match status.code() {
-                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") => {
+                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") => {
                                // tonic's error handling doesn't provide a clear code for disconnections: we get
                                // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
                                info!("broker disconnected: {status}");
@@ -411,7 +411,6 @@ impl ConnectionManagerState {

        let node_id = new_sk.safekeeper_id;
        let connect_timeout = self.conf.wal_connect_timeout;
-        let ingest_batch_size = self.conf.ingest_batch_size;
        let timeline = Arc::clone(&self.timeline);
        let ctx = ctx.detached_child(
            TaskKind::WalReceiverConnectionHandler,
@@ -431,7 +430,6 @@ impl ConnectionManagerState {
                    connect_timeout,
                    ctx,
                    node_id,
-                    ingest_batch_size,
                )
                .await;

@@ -1347,7 +1345,6 @@ mod tests {
                max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
                auth_token: None,
                availability_zone: None,
-                ingest_batch_size: 1,
            },
            wal_connection: None,
            wal_stream_candidates: HashMap::new(),
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
 use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
-    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
+    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
    task_mgr,
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
@@ -106,7 +106,6 @@ impl From<WalDecodeError> for WalReceiverError {

 /// Open a connection to the given safekeeper and receive WAL, sending back progress
 /// messages as we go.
-#[allow(clippy::too_many_arguments)]
 pub(super) async fn handle_walreceiver_connection(
    timeline: Arc<Timeline>,
    wal_source_connconf: PgConnectionConfig,
@@ -115,7 +114,6 @@ pub(super) async fn handle_walreceiver_connection(
    connect_timeout: Duration,
    ctx: RequestContext,
    node: NodeId,
-    ingest_batch_size: u64,
 ) -> Result<(), WalReceiverError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

@@ -307,9 +305,7 @@ pub(super) async fn handle_walreceiver_connection(

                {
                    let mut decoded = DecodedWALRecord::default();
-                    let mut modification = timeline.begin_modification(startlsn);
-                    let mut uncommitted_records = 0;
-                    let mut filtered_records = 0;
+                    let mut modification = timeline.begin_modification(endlsn);
                    while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
@@ -318,40 +314,14 @@ pub(super) async fn handle_walreceiver_connection(
                            return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
                        }

-                        // Ingest the records without immediately committing them.
-                        let ingested = walingest
+                        walingest
                            .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
                            .await
                            .with_context(|| format!("could not ingest record at {lsn}"))?;
-                        if !ingested {
-                            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
-                            WAL_INGEST.records_filtered.inc();
-                            filtered_records += 1;
-                        }

                        fail_point!("walreceiver-after-ingest");

                        last_rec_lsn = lsn;
-
-                        // Commit every ingest_batch_size records. Even if we filtered out
-                        // all records, we still need to call commit to advance the LSN.
-                        uncommitted_records += 1;
-                        if uncommitted_records >= ingest_batch_size {
-                            WAL_INGEST
-                                .records_committed
-                                .inc_by(uncommitted_records - filtered_records);
-                            modification.commit(&ctx).await?;
-                            uncommitted_records = 0;
-                            filtered_records = 0;
-                        }
-                    }
-
-                    // Commit the remaining records.
-                    if uncommitted_records > 0 {
-                        WAL_INGEST
-                            .records_committed
-                            .inc_by(uncommitted_records - filtered_records);
-                        modification.commit(&ctx).await?;
                    }
                }

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -29,7 +29,6 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
 use anyhow::{bail, Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
-use utils::failpoint_support;

 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
@@ -48,18 +47,20 @@ use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;

-pub struct WalIngest {
+pub struct WalIngest<'a> {
    shard: ShardIdentity,
+    timeline: &'a Timeline,
+
    checkpoint: CheckPoint,
    checkpoint_modified: bool,
 }

-impl WalIngest {
+impl<'a> WalIngest<'a> {
    pub async fn new(
-        timeline: &Timeline,
+        timeline: &'a Timeline,
        startpoint: Lsn,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<WalIngest> {
+        ctx: &'_ RequestContext,
+    ) -> anyhow::Result<WalIngest<'a>> {
        // Fetch the latest checkpoint into memory, so that we can compare with it
        // quickly in `ingest_record` and update it when it changes.
        let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -68,6 +69,7 @@ impl WalIngest {

        Ok(WalIngest {
            shard: *timeline.get_shard_identity(),
+            timeline,
            checkpoint,
            checkpoint_modified: false,
        })
@@ -81,8 +83,6 @@ impl WalIngest {
    /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
    /// relations/pages that the record affects.
    ///
-    /// This function returns `true` if the record was ingested, and `false` if it was filtered out
-    ///
    pub async fn ingest_record(
        &mut self,
        recdata: Bytes,
@@ -90,13 +90,11 @@ impl WalIngest {
        modification: &mut DatadirModification<'_>,
        decoded: &mut DecodedWALRecord,
        ctx: &RequestContext,
-    ) -> anyhow::Result<bool> {
+    ) -> anyhow::Result<()> {
        WAL_INGEST.records_received.inc();
-        let pg_version = modification.tline.pg_version;
-        let prev_len = modification.len();

-        modification.set_lsn(lsn)?;
-        decode_wal_record(recdata, decoded, pg_version)?;
+        modification.lsn = lsn;
+        decode_wal_record(recdata, decoded, self.timeline.pg_version)?;

        let mut buf = decoded.record.clone();
        buf.advance(decoded.main_data_offset);
@@ -133,9 +131,9 @@ impl WalIngest {
            }
            pg_constants::RM_DBASE_ID => {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-                debug!(%info, %pg_version, "handle RM_DBASE_ID");
+                debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID");

-                if pg_version == 14 {
+                if self.timeline.pg_version == 14 {
                    if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
                        let createdb = XlCreateDatabase::decode(&mut buf);
                        debug!("XLOG_DBASE_CREATE v14");
@@ -151,7 +149,7 @@ impl WalIngest {
                                .await?;
                        }
                    }
-                } else if pg_version == 15 {
+                } else if self.timeline.pg_version == 15 {
                    if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
                        debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
                    } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -171,7 +169,7 @@ impl WalIngest {
                                .await?;
                        }
                    }
-                } else if pg_version == 16 {
+                } else if self.timeline.pg_version == 16 {
                    if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
                        debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
                    } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -346,7 +344,9 @@ impl WalIngest {
                        // particular point in the WAL. For more fine-grained control,
                        // we could peek into the message and only pause if it contains
                        // a particular string, for example, but this is enough for now.
-                        failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
+                        crate::failpoint_support::sleep_millis_async!(
+                            "wal-ingest-logical-message-sleep"
+                        );
                    } else if let Some(path) = prefix.strip_prefix("neon-file:") {
                        modification.put_file(path, message, ctx).await?;
                    }
@@ -400,11 +400,19 @@ impl WalIngest {
            self.checkpoint_modified = false;
        }

-        // Note that at this point this record is only cached in the modification
-        // until commit() is called to flush the data into the repository and update
-        // the latest LSN.
+        if modification.is_empty() {
+            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
+            WAL_INGEST.records_filtered.inc();
+            modification.tline.finish_write(lsn);
+        } else {
+            WAL_INGEST.records_committed.inc();
+            modification.commit(ctx).await?;
+        }

-        Ok(modification.len() > prev_len)
+        // This record has been fully handled, including updating the checkpoint data; the
+        // branch above (finish_write() or commit()) already advanced the repository to this LSN.
+
+        Ok(())
    }

    /// Do not store this block, but observe it for the purposes of updating our relation size state.
@@ -451,7 +459,7 @@ impl WalIngest {
            && (decoded.xl_info == pg_constants::XLOG_FPI
                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
            // compression of WAL is not yet supported: fall back to storing the original WAL record
-            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)?
+            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
            // do not materialize null pages because them most likely be soon replaced with real data
            && blk.bimg_len != 0
        {
@@ -504,7 +512,7 @@ impl WalIngest {
        let mut old_heap_blkno: Option<u32> = None;
        let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;

-        match modification.tline.pg_version {
+        match self.timeline.pg_version {
            14 => {
                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
@@ -728,7 +736,7 @@ impl WalIngest {
            // replaying it would fail to find the previous image of the page, because
            // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
            // record if it doesn't.
-            let vm_size = get_relsize(modification, vm_rel, ctx).await?;
+            let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
            if let Some(blknum) = new_vm_blk {
                if blknum >= vm_size {
                    new_vm_blk = None;
@@ -809,11 +817,10 @@ impl WalIngest {
        let mut new_heap_blkno: Option<u32> = None;
        let mut old_heap_blkno: Option<u32> = None;
        let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
-        let pg_version = modification.tline.pg_version;

        assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);

-        match pg_version {
+        match self.timeline.pg_version {
            16 => {
                let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -876,7 +883,7 @@ impl WalIngest {
            }
            _ => bail!(
                "Neon RMGR has no known compatibility with PostgreSQL version {}",
-                pg_version
+                self.timeline.pg_version
            ),
        }

@@ -899,7 +906,7 @@ impl WalIngest {
            // replaying it would fail to find the previous image of the page, because
            // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
            // record if it doesn't.
-            let vm_size = get_relsize(modification, vm_rel, ctx).await?;
+            let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
            if let Some(blknum) = new_vm_blk {
                if blknum >= vm_size {
                    new_vm_blk = None;
@@ -977,14 +984,16 @@ impl WalIngest {
        let src_db_id = rec.src_db_id;
        let src_tablespace_id = rec.src_tablespace_id;

+        // Creating a database is implemented by copying the template (aka. source) database.
+        // To copy all the relations, we need to ask for the state as of the same LSN, but we
+        // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for
+        // the last valid LSN to advance up to it. So we use the previous record's LSN in the
+        // get calls instead.
+        let req_lsn = modification.tline.get_last_record_lsn();
+
        let rels = modification
            .tline
-            .list_rels(
-                src_tablespace_id,
-                src_db_id,
-                Version::Modified(modification),
-                ctx,
-            )
+            .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx)
            .await?;

        debug!("ingest_xlog_dbase_create: {} rels", rels.len());
@@ -992,12 +1001,7 @@ impl WalIngest {
        // Copy relfilemap
        let filemap = modification
            .tline
-            .get_relmap_file(
-                src_tablespace_id,
-                src_db_id,
-                Version::Modified(modification),
-                ctx,
-            )
+            .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx)
            .await?;
        modification
            .put_relmap_file(tablespace_id, db_id, filemap, ctx)
@@ -1011,7 +1015,7 @@ impl WalIngest {

            let nblocks = modification
                .tline
-                .get_rel_size(src_rel, Version::Modified(modification), true, ctx)
+                .get_rel_size(src_rel, req_lsn, true, ctx)
                .await?;
            let dst_rel = RelTag {
                spcnode: tablespace_id,
@@ -1029,13 +1033,7 @@ impl WalIngest {

                let content = modification
                    .tline
-                    .get_rel_page_at_lsn(
-                        src_rel,
-                        blknum,
-                        Version::Modified(modification),
-                        true,
-                        ctx,
-                    )
+                    .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx)
                    .await?;
                modification.put_rel_page_image(dst_rel, blknum, content)?;
                num_blocks_copied += 1;
@@ -1106,7 +1104,7 @@ impl WalIngest {
                modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
                fsm_physical_page_no += 1;
            }
-            let nblocks = get_relsize(modification, rel, ctx).await?;
+            let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
            if nblocks > fsm_physical_page_no {
                // check if something to do: FSM is larger than truncate position
                self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
@@ -1128,7 +1126,7 @@ impl WalIngest {
                modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
                vm_page_no += 1;
            }
-            let nblocks = get_relsize(modification, rel, ctx).await?;
+            let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
            if nblocks > vm_page_no {
                // check if something to do: VM is larger than truncate position
                self.put_rel_truncation(modification, rel, vm_page_no, ctx)
@@ -1201,9 +1199,10 @@ impl WalIngest {
                    dbnode: xnode.dbnode,
                    relnode: xnode.relnode,
                };
+                let last_lsn = self.timeline.get_last_record_lsn();
                if modification
                    .tline
-                    .get_rel_exists(rel, Version::Modified(modification), true, ctx)
+                    .get_rel_exists(rel, last_lsn, true, ctx)
                    .await?
                {
                    self.put_rel_drop(modification, rel, ctx).await?;
@@ -1257,9 +1256,10 @@ impl WalIngest {
        // will block waiting for the last valid LSN to advance up to
        // it. So we use the previous record's LSN in the get calls
        // instead.
+        let req_lsn = modification.tline.get_last_record_lsn();
        for segno in modification
            .tline
-            .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
+            .list_slru_segments(SlruKind::Clog, req_lsn, ctx)
            .await?
        {
            let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1471,6 +1471,20 @@ impl WalIngest {
        Ok(())
    }

+    /// Size of `rel` in blocks as seen at `lsn`; a relation that does not exist counts as 0.
+    async fn get_relsize(
+        &mut self,
+        rel: RelTag,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<BlockNumber> {
+        if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? {
+            return Ok(0);
+        }
+        let nblocks = self.timeline.get_rel_size(rel, lsn, true, ctx).await?;
+        Ok(nblocks)
+    }
+
    async fn handle_rel_extend(
        &mut self,
        modification: &mut DatadirModification<'_>,
@@ -1482,6 +1496,7 @@ impl WalIngest {
        // Check if the relation exists. We implicitly create relations on first
        // record.
        // TODO: would be nice if to be more explicit about it
+        let last_lsn = modification.lsn;

        // Get current size and put rel creation if rel doesn't exist
        //
@@ -1489,14 +1504,11 @@ impl WalIngest {
        //       check the cache too. This is because eagerly checking the cache results in
        //       less work overall and 10% better performance. It's more work on cache miss
        //       but cache miss is rare.
-        let old_nblocks = if let Some(nblocks) = modification
-            .tline
-            .get_cached_rel_size(&rel, modification.get_lsn())
-        {
+        let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
            nblocks
-        } else if !modification
-            .tline
-            .get_rel_exists(rel, Version::Modified(modification), true, ctx)
+        } else if !self
+            .timeline
+            .get_rel_exists(rel, last_lsn, true, ctx)
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
@@ -1506,10 +1518,7 @@ impl WalIngest {
                .context("Relation Error")?;
            0
        } else {
-            modification
-                .tline
-                .get_rel_size(rel, Version::Modified(modification), true, ctx)
-                .await?
+            self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
        };

        if new_nblocks > old_nblocks {
@@ -1562,9 +1571,10 @@ impl WalIngest {
        // Check if the relation exists. We implicitly create relations on first
        // record.
        // TODO: would be nice if to be more explicit about it
-        let old_nblocks = if !modification
-            .tline
-            .get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx)
+        let last_lsn = self.timeline.get_last_record_lsn();
+        let old_nblocks = if !self
+            .timeline
+            .get_slru_segment_exists(kind, segno, last_lsn, ctx)
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
@@ -1573,9 +1583,8 @@ impl WalIngest {
                .await?;
            0
        } else {
-            modification
-                .tline
-                .get_slru_segment_size(kind, segno, Version::Modified(modification), ctx)
+            self.timeline
+                .get_slru_segment_size(kind, segno, last_lsn, ctx)
                .await?
        };

@@ -1598,32 +1607,11 @@ impl WalIngest {
    }
 }

-async fn get_relsize(
-    modification: &DatadirModification<'_>,
-    rel: RelTag,
-    ctx: &RequestContext,
-) -> anyhow::Result<BlockNumber> {
-    let nblocks = if !modification
-        .tline
-        .get_rel_exists(rel, Version::Modified(modification), true, ctx)
-        .await?
-    {
-        0
-    } else {
-        modification
-            .tline
-            .get_rel_size(rel, Version::Modified(modification), true, ctx)
-            .await?
-    };
-    Ok(nblocks)
-}
-
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::tenant::harness::*;
-    use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
    use crate::tenant::Timeline;
    use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
    use postgres_ffi::RELSEG_SIZE;
@@ -1644,7 +1632,10 @@ mod tests {

    static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);

-    async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
+    async fn init_walingest_test<'a>(
+        tline: &'a Timeline,
+        ctx: &RequestContext,
+    ) -> Result<WalIngest<'a>> {
        let mut m = tline.begin_modification(Lsn(0x10));
        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
@@ -1689,29 +1680,29 @@ mod tests {
        // The relation was created at LSN 2, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
                .await?,
            false
        );
        assert!(tline
-            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
+            .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
            .await
            .is_err());
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            1
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
                .await?,
            3
        );
@@ -1719,46 +1710,46 @@ mod tests {
        // Check page contents at each LSN
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx)
                .await?,
            TEST_IMG("foo blk 0 at 2")
        );

        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx)
                .await?,
            TEST_IMG("foo blk 0 at 3")
        );

        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx)
                .await?,
            TEST_IMG("foo blk 0 at 3")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx)
                .await?,
            TEST_IMG("foo blk 1 at 4")
        );

        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx)
                .await?,
            TEST_IMG("foo blk 0 at 3")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx)
                .await?,
            TEST_IMG("foo blk 1 at 4")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
                .await?,
            TEST_IMG("foo blk 2 at 5")
        );
@@ -1774,19 +1765,19 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
                .await?,
            2
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx)
                .await?,
            TEST_IMG("foo blk 0 at 3")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx)
                .await?,
            TEST_IMG("foo blk 1 at 4")
        );
@@ -1794,13 +1785,13 @@ mod tests {
        // should still see the truncated block with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
                .await?,
            3
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
                .await?,
            TEST_IMG("foo blk 2 at 5")
        );
@@ -1813,7 +1804,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
                .await?,
            0
        );
@@ -1826,19 +1817,19 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
                .await?,
            2
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx)
                .await?,
            ZERO_PAGE
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx)
                .await?,
            TEST_IMG("foo blk 1")
        );
@@ -1851,21 +1842,21 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
                .await?,
            1501
        );
        for blk in 2..1500 {
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx)
                    .await?,
                ZERO_PAGE
            );
        }
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx)
                .await?,
            TEST_IMG("foo blk 1500")
        );
@@ -1892,13 +1883,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            1
        );
@@ -1911,7 +1902,7 @@ mod tests {
        // Check that rel is not visible anymore
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx)
                .await?,
            false
        );
@@ -1929,13 +1920,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx)
                .await?,
            1
        );
@@ -1968,24 +1959,24 @@ mod tests {
        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
                .await?,
            false
        );
        assert!(tline
-            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
+            .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
            .await
            .is_err());

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
                .await?,
            relsize
        );
@@ -1996,7 +1987,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx)
                    .await?,
                TEST_IMG(&data)
            );
@@ -2013,7 +2004,7 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
                .await?,
            1
        );
@@ -2023,7 +2014,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx)
                    .await?,
                TEST_IMG(&data)
            );
@@ -2032,7 +2023,7 @@ mod tests {
        // should still see all blocks with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
                .await?,
            relsize
        );
@@ -2041,7 +2032,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx)
                    .await?,
                TEST_IMG(&data)
            );
@@ -2061,13 +2052,13 @@ mod tests {

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
+                .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
+                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
                .await?,
            relsize
        );
@@ -2077,7 +2068,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx)
                    .await?,
                TEST_IMG(&data)
            );
@@ -2110,9 +2101,7 @@ mod tests {
        assert_current_logical_size(&tline, Lsn(lsn));

        assert_eq!(
-            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
-                .await?,
+            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE + 1
        );

@@ -2124,9 +2113,7 @@ mod tests {
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
-            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
-                .await?,
+            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE
        );
        assert_current_logical_size(&tline, Lsn(lsn));
@@ -2139,9 +2126,7 @@ mod tests {
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
-            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
-                .await?,
+            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE - 1
        );
        assert_current_logical_size(&tline, Lsn(lsn));
@@ -2157,9 +2142,7 @@ mod tests {
                .await?;
            m.commit(&ctx).await?;
            assert_eq!(
-                tline
-                    .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
-                    .await?,
+                tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
                size as BlockNumber
            );

@@ -2194,25 +2177,21 @@ mod tests {
        let pg_version = 15; // The test data was generated by pg15
        let path = "test_data/sk_wal_segment_from_pgbench";
        let wal_segment_path = format!("{path}/000000010000000000000001.zst");
-        let source_initdb_path = format!("{path}/{INITDB_PATH}");
        let startpoint = Lsn::from_hex("14AEC08").unwrap();
-        let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
-
-        let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID);
-        let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path());
-
-        std::fs::create_dir_all(initdb_path.parent().unwrap())
-            .expect("creating test dir should work");
-        std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works");
+        let endpoint = Lsn::from_hex("1FFFF98").unwrap();

        // Bootstrap a real timeline. We can't use create_test_timeline because
        // it doesn't create a real checkpoint, and Walingest::new tries to parse
        // the garbage data.
+        //
+        // TODO use the initdb.tar.zst file stored with the test data to avoid
+        //      problems with inconsistent initdb results after pg minor version bumps.
+        let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal")
+            .unwrap()
+            .load()
+            .await;
        let tline = tenant
-            .bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx)
+            .bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx)
            .await
            .unwrap();

@@ -2238,7 +2217,7 @@ mod tests {
        let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx)
            .await
            .unwrap();
-        let mut modification = tline.begin_modification(startpoint);
+        let mut modification = tline.begin_modification(endpoint);
        let mut decoded = DecodedWALRecord::default();
        println!("decoding {} bytes", bytes.len() - xlogoff);

@@ -2252,7 +2231,6 @@ mod tests {
                    .await
                    .unwrap();
            }
-            modification.commit(&ctx).await.unwrap();
        }

        let duration = started_at.elapsed();
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,7 +22,6 @@ use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
-use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use std::collections::VecDeque;
 use std::io;
@@ -36,11 +35,14 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
-use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
+use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

 #[cfg(feature = "testing")]
 use std::sync::atomic::{AtomicUsize, Ordering};

+#[cfg(feature = "testing")]
+use pageserver_api::shard::TenantShardId;
+
 use crate::config::PageServerConf;
 use crate::metrics::{
    WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
@@ -90,7 +92,7 @@ struct ProcessOutput {
 /// records.
 ///
 pub struct PostgresRedoManager {
-    tenant_shard_id: TenantShardId,
+    tenant_id: TenantId,
    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
    redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
@@ -184,13 +186,10 @@ impl PostgresRedoManager {
    ///
    /// Create a new PostgresRedoManager.
    ///
-    pub fn new(
-        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
-    ) -> PostgresRedoManager {
+    pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager {
        // The actual process is launched lazily, on first request.
        PostgresRedoManager {
-            tenant_shard_id,
+            tenant_id,
            conf,
            last_redo_at: std::sync::Mutex::default(),
            redo_process: RwLock::new(None),
@@ -245,12 +244,8 @@ impl PostgresRedoManager {
                                let timer =
                                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
                                let proc = Arc::new(
-                                    WalRedoProcess::launch(
-                                        self.conf,
-                                        self.tenant_shard_id,
-                                        pg_version,
-                                    )
-                                    .context("launch walredo process")?,
+                                    WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
+                                        .context("launch walredo process")?,
                                );
                                timer.observe_duration();
                                *proc_guard = Some(Arc::clone(&proc));
@@ -643,7 +638,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
 struct WalRedoProcess {
    #[allow(dead_code)]
    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
+    tenant_id: TenantId,
    // Some() on construction, only becomes None on Drop.
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
@@ -657,10 +652,10 @@ impl WalRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
-    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
+    #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
    fn launch(
        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        pg_version: u32,
    ) -> anyhow::Result<Self> {
        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
@@ -685,7 +680,7 @@ impl WalRedoProcess {
            // as close-on-exec by default, but that's not enough, since we use
            // libraries that directly call libc open without setting that flag.
            .close_fds()
-            .spawn_no_leak_child(tenant_shard_id)
+            .spawn_no_leak_child(tenant_id)
            .context("spawn process")?;
        WAL_REDO_PROCESS_COUNTERS.started.inc();
        let mut child = scopeguard::guard(child, |child| {
@@ -746,12 +741,12 @@ impl WalRedoProcess {
                        error!(error=?e, "failed to read from walredo stderr");
                    }
                }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
        );

        Ok(Self {
            conf,
-            tenant_shard_id,
+            tenant_id,
            child: Some(child),
            stdin: Mutex::new(ProcessInput {
                stdin,
@@ -777,7 +772,7 @@ impl WalRedoProcess {
    // Apply given WAL records ('records') over an old page image. Returns
    // new page image.
    //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
    fn apply_wal_records(
        &self,
        tag: BufferTag,
@@ -971,7 +966,11 @@ impl WalRedoProcess {
        // these files will be collected to an allure report
        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());

-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+        // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
+        let path = self
+            .conf
+            .tenant_path(&TenantShardId::unsharded(self.tenant_id))
+            .join(&filename);

        let res = std::fs::OpenOptions::new()
            .write(true)
@@ -1005,7 +1004,7 @@ impl Drop for WalRedoProcess {
 /// Wrapper type around `std::process::Child` which guarantees that the child
 /// will be killed and waited-for by this process before being dropped.
 struct NoLeakChild {
-    tenant_id: TenantShardId,
+    tenant_id: TenantId,
    child: Option<Child>,
 }

@@ -1024,7 +1023,7 @@ impl DerefMut for NoLeakChild {
 }

 impl NoLeakChild {
-    fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
+    fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result<Self> {
        let child = command.spawn()?;
        Ok(NoLeakChild {
            tenant_id,
@@ -1079,7 +1078,7 @@ impl Drop for NoLeakChild {
            Some(child) => child,
            None => return,
        };
-        let tenant_shard_id = self.tenant_id;
+        let tenant_id = self.tenant_id;
        // Offload the kill+wait of the child process into the background.
        // If someone stops the runtime, we'll leak the child process.
        // We can ignore that case because we only stop the runtime on pageserver exit.
@@ -1087,11 +1086,7 @@ impl Drop for NoLeakChild {
            tokio::task::spawn_blocking(move || {
                // Intentionally don't inherit the tracing context from whoever is dropping us.
                // This thread here is going to outlive of our dropper.
-                let span = tracing::info_span!(
-                    "walredo",
-                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard_id = %tenant_shard_id.shard_slug()
-                );
+                let span = tracing::info_span!("walredo", %tenant_id);
                let _entered = span.enter();
                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
            })
@@ -1101,11 +1096,11 @@ impl Drop for NoLeakChild {
 }

 trait NoLeakChildCommandExt {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild>;
 }

 impl NoLeakChildCommandExt for Command {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild> {
        NoLeakChild::spawn(tenant_id, self)
    }
 }
@@ -1160,7 +1155,6 @@ mod tests {
    use crate::repository::Key;
    use crate::{config::PageServerConf, walrecord::NeonWalRecord};
    use bytes::Bytes;
-    use pageserver_api::shard::TenantShardId;
    use std::str::FromStr;
    use utils::{id::TenantId, lsn::Lsn};

@@ -1270,9 +1264,9 @@ mod tests {
            let repo_dir = camino_tempfile::tempdir()?;
            let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
            let conf = Box::leak(Box::new(conf));
-            let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+            let tenant_id = TenantId::generate();

-            let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+            let manager = PostgresRedoManager::new(conf, tenant_id);

            Ok(RedoHarness {
                _repo_dir: repo_dir,
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
John Spray	6e53637b06	neon_local: pin pageservers and endpoints	2024-01-03 16:14:58 +00:00
John Spray	45bd85c372	pageserver: skip the blk range check: this is slow (reads relsize page) if queries aren't hitting latest=true	2023-12-20 20:06:56 +00:00
John Spray	403aff42d7	timeline: skip materialized page cache for relation blocks	2023-12-20 20:06:46 +00:00
John Spray	fa113f8d40	page_cache: don't use a histogram (too expensive)	2023-12-20 20:05:06 +00:00
John Spray	8ec1b57a2c	pagebench: hack around bug	2023-12-20 20:03:23 +00:00
John Spray	a8ec7d7ad8	pageserver: prototype of skipping page cache for non-index block reads	2023-12-20 19:18:26 +00:00
Christian Schwarz	5f7e821a62	make CI happy	2023-12-20 15:53:21 +00:00
Christian Schwarz	c417a23dd0	pagebench: factor out the concept of thread local stats	2023-12-18 18:32:22 +00:00
Christian Schwarz	20e5e9dd16	pagebench: finish trigger initial logical size calculation benchmark	2023-12-18 18:32:22 +00:00
Christian Schwarz	24c72db5ff	pagebench: centralize target discovery	2023-12-18 18:32:22 +00:00
Christian Schwarz	6aee8511f7	pagebench: getpage: WIP: when auto-discovering timelines, add ability to limit	2023-12-18 18:32:22 +00:00
Christian Schwarz	ad2091bdd0	pagebench: WIP: command to trigger initial logical size calculation	2023-12-18 18:32:21 +00:00
Christian Schwarz	573d4752e6	pagebench: add a 'getpage@lsn' benchmark	2023-12-18 18:32:21 +00:00
Christian Schwarz	136bec6014	pagebench: add a 'basebackup' benchmark	2023-12-18 18:32:21 +00:00
Christian Schwarz	0f8b4faa50	pagebench: scaffold	2023-12-18 18:32:21 +00:00
Christian Schwarz	5b42949531	Merge branch 'main' into problame/benchmarking/pr/timeline-ids-in-tenant-details	2023-12-18 19:22:19 +01:00
Christian Schwarz	4a6dfb0ccb	include timeline ids in tenant details response	2023-12-18 15:12:48 +00:00