tweak comments

proxy: improve performance of leaky-bucket
2026-07-09 07:00:37 +00:00 · 2024-07-29 11:41:44 +01:00 · 2024-07-28 23:00:21 +01:00
130 changed files with 2339 additions and 7757 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -8,8 +8,6 @@ self-hosted-runner:
    - small-arm64
    - us-east-2
 config-variables:
-  - BENCHMARK_PROJECT_ID_PUB
-  - BENCHMARK_PROJECT_ID_SUB
  - REMOTE_STORAGE_AZURE_CONTAINER
  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -14,8 +14,11 @@ inputs:
  api_host:
    description: 'Neon API host'
    default: console-stage.neon.build
+  provisioner:
+    description: 'k8s-pod or k8s-neonvm'
+    default: 'k8s-pod'
  compute_units:
-    description: '[Min, Max] compute units'
+    description: '[Min, Max] compute units. Min and Max are used for autoscaling with k8s-neonvm; for k8s-pod, Min and Max should be equal'
    default: '[1, 1]'

 outputs:
@@ -34,6 +37,10 @@ runs:
      # A shell without `set -x` to not to expose password/dsn in logs
      shell: bash -euo pipefail {0}
      run: |
+        if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
+          echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
+        fi
+
        project=$(curl \
          "https://${API_HOST}/api/v2/projects" \
          --fail \
@@ -45,7 +52,7 @@ runs:
              \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
              \"pg_version\": ${POSTGRES_VERSION},
              \"region_id\": \"${REGION_ID}\",
-              \"provisioner\": \"k8s-neonvm\",
+              \"provisioner\": \"${PROVISIONER}\",
              \"autoscaling_limit_min_cu\": ${MIN_CU},
              \"autoscaling_limit_max_cu\": ${MAX_CU},
              \"settings\": { }
@@ -68,5 +75,6 @@ runs:
        API_KEY: ${{ inputs.api_key }}
        REGION_ID: ${{ inputs.region_id }}
        POSTGRES_VERSION: ${{ inputs.postgres_version }}
+        PROVISIONER: ${{ inputs.provisioner }}
        MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
        MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -19,10 +19,6 @@ on:
        description: 'debug or release'
        required: true
        type: string
-      pg-versions:
-        description: 'a json array of postgres versions to run regression tests on'
-        required: true
-        type: string

 defaults:
  run:
@@ -258,7 +254,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        pg_version: ${{ fromJson(inputs.pg-versions) }}
+        pg_version: [ v14, v15, v16 ]
    steps:
      - uses: actions/checkout@v4
        with:
@@ -288,5 +284,5 @@ jobs:
      - name: Merge and upload coverage data
        if: |
          false &&
-          inputs.build-type == 'debug' && matrix.pg_version == 'v16'
+          inputs.build-type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -63,9 +63,11 @@ jobs:
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "neon-staging"
            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+            provisioner: 'k8s-pod' 
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "azure-staging"
            region_id: 'azure-eastus2'
+            provisioner: 'k8s-neonvm'
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -98,6 +100,7 @@ jobs:
        region_id: ${{ matrix.region_id }}
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+        provisioner: ${{ matrix.provisioner }}

    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
@@ -147,7 +150,7 @@ jobs:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -168,7 +171,7 @@ jobs:
        path: /tmp/neon/
        prefix: latest

-    - name: Run Logical Replication benchmarks
+    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
@@ -176,15 +179,12 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 5400
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
-        BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
-        BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}

-    - name: Run Physical Replication benchmarks
+    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
@@ -216,11 +216,11 @@ jobs:
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
    #
    # Available platforms:
-    # - neonvm-captest-new: Freshly created project (1 CU)
-    # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
+    # - neon-captest-new: Freshly created project (1 CU)
+    # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
    # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
    # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
-    # - neonvm-captest-reuse: Reusing existing project
+    # - neon-captest-reuse: Reusing existing project
    # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
    env:
@@ -245,16 +245,18 @@ jobs:
            "'"$region_id_default"'"
            ],
          "platform": [
-            "neonvm-captest-new",
-            "neonvm-captest-reuse",
+            "neon-captest-new",
+            "neon-captest-reuse",
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier",         "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new",              "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'

@@ -269,7 +271,7 @@ jobs:
      run: |
        matrix='{
          "platform": [
-            "neonvm-captest-reuse"
+            "neon-captest-reuse"
          ]
        }'

@@ -285,7 +287,7 @@ jobs:
      run: |
        matrix='{
          "platform": [
-            "neonvm-captest-reuse"
+            "neon-captest-reuse"
          ],
          "scale": [
            "10"
@@ -336,7 +338,7 @@ jobs:
        prefix: latest

    - name: Create Neon Project
-      if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
+      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
@@ -344,18 +346,19 @@ jobs:
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
        compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
+        provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

    - name: Set up Connection String
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
          neonvm-captest-sharding-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
            ;;
-          neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
          rds-aurora)
@@ -439,9 +442,9 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - PLATFORM: "neonvm-captest-pgvector"
+          - PLATFORM: "neon-captest-pgvector"
          - PLATFORM: "azure-captest-pgvector"
-
+            
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
      TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -483,7 +486,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-pgvector)
+          neon-captest-pgvector)
            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
            ;;
          azure-captest-pgvector)
@@ -582,7 +585,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
            ;;
          rds-aurora)
@@ -592,7 +595,7 @@ jobs:
            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -669,7 +672,7 @@ jobs:
    - name: Get Connstring Secret Name
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            ENV_PLATFORM=CAPTEST_TPCH
            ;;
          rds-aurora)
@@ -679,7 +682,7 @@ jobs:
            ENV_PLATFORM=RDS_AURORA_TPCH
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -756,7 +759,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
            ;;
          rds-aurora)
@@ -766,7 +769,7 @@ jobs:
            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -203,8 +203,7 @@ jobs:
      fail-fast: false
      matrix:
        arch: [ x64 ]
-        # Do not build or run tests in debug for release branches
-        build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
+        build-type: [ debug, release ]
        include:
          - build-type: release
            arch: arm64
@@ -214,8 +213,6 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
-      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
    secrets: inherit

  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -309,7 +306,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  create-test-report:
-    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
+    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -836,9 +833,6 @@ jobs:
          rm -rf .docker-custom

  promote-images:
-    permissions:
-      contents: read  # This is required for actions/checkout
-      id-token: write # This is required for Azure Login to work.
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04

@@ -865,28 +859,6 @@ jobs:
                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
          done

-      - name: Azure login
-        if: github.ref_name == 'main'
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
-      - name: Login to ACR
-        if: github.ref_name == 'main'
-        run: |
-          az acr login --name=neoneastus2
-
-      - name: Copy docker images to ACR-dev
-        if: github.ref_name == 'main'
-        run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
-            docker buildx imagetools create \
-              -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
-                                        neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
-          done
-
      - name: Add latest tag to images
        if: github.ref_name == 'main'
        run: |
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -13,7 +13,6 @@ on:
    paths:
      - '.github/workflows/pg-clients.yml'
      - 'test_runner/pg_clients/**'
-      - 'test_runner/logical_repl/**'
      - 'poetry.lock'
  workflow_dispatch:

@@ -50,101 +49,6 @@ jobs:
      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
    secrets: inherit

-  test-logical-replication:
-    needs: [ build-build-tools-image ]
-    runs-on: ubuntu-22.04
-
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init --user root
-    services:
-      clickhouse:
-        image: clickhouse/clickhouse-server:24.6.3.64
-        ports:
-          - 9000:9000
-          - 8123:8123
-      zookeeper:
-        image: quay.io/debezium/zookeeper:2.7
-        ports:
-          - 2181:2181
-      kafka:
-        image: quay.io/debezium/kafka:2.7
-        env:
-          ZOOKEEPER_CONNECT: "zookeeper:2181"
-          KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
-          KAFKA_BROKER_ID: 1
-          KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
-          KAFKA_JMX_PORT: 9991
-        ports:
-          - 9092:9092
-      debezium:
-        image: quay.io/debezium/connect:2.7
-        env:
-          BOOTSTRAP_SERVERS: kafka:9092
-          GROUP_ID: 1
-          CONFIG_STORAGE_TOPIC: debezium-config
-          OFFSET_STORAGE_TOPIC: debezium-offset
-          STATUS_STORAGE_TOPIC: debezium-status
-          DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
-        ports:
-          - 8083:8083
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Download Neon artifact
-        uses: ./.github/actions/download
-        with:
-          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-          path: /tmp/neon/
-          prefix: latest
-
-      - name: Create Neon Project
-        id: create-neon-project
-        uses: ./.github/actions/neon-project-create
-        with:
-          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-          postgres_version: ${{ env.DEFAULT_PG_VERSION }}
-
-      - name: Run tests
-        uses: ./.github/actions/run-python-test-set
-        with:
-          build_type: remote
-          test_selection: logical_repl
-          run_in_parallel: false
-          extra_params: -m remote_cluster
-          pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        env:
-          BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
-
-      - name: Delete Neon Project
-        if: always()
-        uses: ./.github/actions/neon-project-delete
-        with:
-          project_id: ${{ steps.create-neon-project.outputs.project_id }}
-          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-
-      - name: Create Allure report
-        if: ${{ !cancelled() }}
-        id: create-allure-report
-        uses: ./.github/actions/allure-report-generate
-        with:
-          store-test-results-into-db: true
-        env:
-          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-
-      - name: Post to a Slack channel
-        if: github.event.schedule && failure()
-        uses: slackapi/slack-github-action@v1
-        with:
-          channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
-          slack-message: |
-            Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
  test-postgres-client-libs:
    needs: [ build-build-tools-image ]
    runs-on: ubuntu-22.04
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -13,6 +13,8 @@ defaults:
 env:
  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

 jobs:
  cancel-previous-e2e-tests:
@@ -62,35 +64,19 @@ jobs:
    needs: [ tag ]
    runs-on: ubuntu-22.04
    env:
-      EVENT_ACTION: ${{ github.event.action }}
-      GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      TAG: ${{ needs.tag.outputs.build-tag }}
    steps:
-      - name: Wait for `promote-images` job to finish
-        # It's important to have a timeout here, the script in the step can run infinitely
-        timeout-minutes: 60
+      - name: Check if ECR images are present
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        run: |
-          if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then
-            exit 0
-          fi
-
-          # For PRs we use the run id as the tag
-          BUILD_AND_TEST_RUN_ID=${TAG}
-          while true; do
-            conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
-            case "$conclusion" in
-              success)
-                break
-                ;;
-              failure | cancelled | skipped)
-                echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
-                exit 1
-                ;;
-              *)
-                echo "The 'promote-images' hasn't succeed yet. Waiting..."
-                sleep 60
-                ;;
-            esac
+          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
+            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
+            if [ "$OUTPUT" == "" ]; then
+              echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
+              exit 1
+            fi
          done

      - name: Set e2e-platforms
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1418,7 +1418,7 @@ dependencies = [
 "clap",
 "criterion-plot",
 "is-terminal",
- "itertools 0.10.5",
+ "itertools",
 "num-traits",
 "once_cell",
 "oorandom",
@@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
 dependencies = [
 "cast",
- "itertools 0.10.5",
+ "itertools",
 ]

 [[package]]
@@ -1672,7 +1672,6 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
 dependencies = [
 "bitflags 2.4.1",
 "byteorder",
- "chrono",
 "diesel_derives",
 "itoa",
 "pq-sys",
@@ -2134,12 +2133,6 @@ dependencies = [
 "slab",
 ]

-[[package]]
-name = "gen_ops"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
-
 [[package]]
 name = "generic-array"
 version = "0.14.7"
@@ -2716,6 +2709,17 @@ version = "3.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"

+[[package]]
+name = "io-lifetimes"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "io-uring"
 version = "0.6.2"
@@ -2734,13 +2738,14 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"

 [[package]]
 name = "is-terminal"
-version = "0.4.12"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
+checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
 "hermit-abi",
- "libc",
- "windows-sys 0.52.0",
+ "io-lifetimes",
+ "rustix 0.37.25",
+ "windows-sys 0.48.0",
 ]

 [[package]]
@@ -2752,15 +2757,6 @@ dependencies = [
 "either",
 ]

-[[package]]
-name = "itertools"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
-dependencies = [
- "either",
-]
-
 [[package]]
 name = "itoa"
 version = "1.0.6"
@@ -2875,6 +2871,18 @@ version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"

+[[package]]
+name = "linux-raw-sys"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.13"
@@ -2992,7 +3000,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
 dependencies = [
 "libc",
 "measured",
- "procfs",
+ "procfs 0.16.0",
 ]

 [[package]]
@@ -3037,7 +3045,7 @@ dependencies = [
 "measured",
 "measured-process",
 "once_cell",
- "procfs",
+ "procfs 0.14.2",
 "prometheus",
 "rand 0.8.5",
 "rand_distr",
@@ -3566,7 +3574,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "leaky-bucket",
 "md5",
 "metrics",
@@ -3584,9 +3592,8 @@ dependencies = [
 "postgres_connection",
 "postgres_ffi",
 "pq_proto",
- "procfs",
+ "procfs 0.14.2",
 "rand 0.8.5",
- "range-set-blaze",
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
@@ -3637,7 +3644,7 @@ dependencies = [
 "hex",
 "humantime",
 "humantime-serde",
- "itertools 0.10.5",
+ "itertools",
 "postgres_ffi",
 "rand 0.8.5",
 "serde",
@@ -3695,7 +3702,7 @@ dependencies = [
 "hex-literal",
 "humantime",
 "humantime-serde",
- "itertools 0.10.5",
+ "itertools",
 "metrics",
 "once_cell",
 "pageserver_api",
@@ -4027,7 +4034,7 @@ name = "postgres_connection"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "itertools 0.10.5",
+ "itertools",
 "once_cell",
 "postgres",
 "tokio-postgres",
@@ -4085,7 +4092,7 @@ version = "0.1.0"
 dependencies = [
 "byteorder",
 "bytes",
- "itertools 0.10.5",
+ "itertools",
 "pin-project-lite",
 "postgres-protocol",
 "rand 0.8.5",
@@ -4131,6 +4138,21 @@ dependencies = [
 "unicode-ident",
 ]

+[[package]]
+name = "procfs"
+version = "0.14.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
+dependencies = [
+ "bitflags 1.3.2",
+ "byteorder",
+ "chrono",
+ "flate2",
+ "hex",
+ "lazy_static",
+ "rustix 0.36.16",
+]
+
 [[package]]
 name = "procfs"
 version = "0.16.0"
@@ -4138,12 +4160,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
 dependencies = [
 "bitflags 2.4.1",
- "chrono",
- "flate2",
 "hex",
 "lazy_static",
 "procfs-core",
- "rustix",
+ "rustix 0.38.28",
 ]

 [[package]]
@@ -4153,15 +4173,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
 dependencies = [
 "bitflags 2.4.1",
- "chrono",
 "hex",
 ]

 [[package]]
 name = "prometheus"
-version = "0.13.4"
+version = "0.13.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
+checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
 dependencies = [
 "cfg-if",
 "fnv",
@@ -4169,7 +4188,7 @@ dependencies = [
 "libc",
 "memchr",
 "parking_lot 0.12.1",
- "procfs",
+ "procfs 0.14.2",
 "thiserror",
 ]

@@ -4191,7 +4210,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
 dependencies = [
 "bytes",
 "heck 0.4.1",
- "itertools 0.10.5",
+ "itertools",
 "lazy_static",
 "log",
 "multimap",
@@ -4212,7 +4231,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
 dependencies = [
 "anyhow",
- "itertools 0.10.5",
+ "itertools",
 "proc-macro2",
 "quote",
 "syn 1.0.109",
@@ -4269,7 +4288,7 @@ dependencies = [
 "hyper-util",
 "indexmap 2.0.1",
 "ipnet",
- "itertools 0.10.5",
+ "itertools",
 "lasso",
 "md5",
 "measured",
@@ -4445,18 +4464,6 @@ dependencies = [
 "rand_core 0.5.1",
 ]

-[[package]]
-name = "range-set-blaze"
-version = "0.1.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
-dependencies = [
- "gen_ops",
- "itertools 0.12.1",
- "num-integer",
- "num-traits",
-]
-
 [[package]]
 name = "rayon"
 version = "1.7.0"
@@ -4625,7 +4632,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "metrics",
 "once_cell",
 "pin-project-lite",
@@ -4935,6 +4942,34 @@ dependencies = [
 "nom",
 ]

+[[package]]
+name = "rustix"
+version = "0.36.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.1.4",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "rustix"
+version = "0.37.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.3.8",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "rustix"
 version = "0.38.28"
@@ -5683,7 +5718,6 @@ dependencies = [
 "aws-config",
 "bytes",
 "camino",
- "chrono",
 "clap",
 "control_plane",
 "diesel",
@@ -5694,7 +5728,7 @@ dependencies = [
 "hex",
 "humantime",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "lasso",
 "measured",
 "metrics",
@@ -5703,7 +5737,6 @@ dependencies = [
 "pageserver_client",
 "postgres_connection",
 "r2d2",
- "rand 0.8.5",
 "reqwest 0.12.4",
 "routerify",
 "scopeguard",
@@ -5759,10 +5792,9 @@ dependencies = [
 "either",
 "futures",
 "futures-util",
- "git-version",
 "hex",
 "humantime",
- "itertools 0.10.5",
+ "itertools",
 "once_cell",
 "pageserver",
 "pageserver_api",
@@ -5939,15 +5971,15 @@ dependencies = [

 [[package]]
 name = "tempfile"
-version = "3.9.0"
+version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
+checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
 dependencies = [
 "cfg-if",
- "fastrand 2.0.0",
- "redox_syscall 0.4.1",
- "rustix",
- "windows-sys 0.52.0",
+ "fastrand 1.9.0",
+ "redox_syscall 0.3.5",
+ "rustix 0.37.25",
+ "windows-sys 0.45.0",
 ]

 [[package]]
@@ -7144,6 +7176,15 @@ dependencies = [
 "windows_x86_64_msvc 0.42.2",
 ]

+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets 0.42.2",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.48.0"
@@ -7162,6 +7203,21 @@ dependencies = [
 "windows-targets 0.52.4",
 ]

+[[package]]
+name = "windows-targets"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
 [[package]]
 name = "windows-targets"
 version = "0.48.0"
@@ -7391,7 +7447,7 @@ dependencies = [
 "hmac",
 "hyper 0.14.26",
 "indexmap 1.9.3",
- "itertools 0.10.5",
+ "itertools",
 "libc",
 "log",
 "memchr",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
 parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
-procfs = "0.16"
+procfs = "0.14"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -933,8 +933,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
 #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
 COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
 COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
-COPY --from=rum-pg-build /rum.tar.gz /ext-src
-COPY patches/rum.patch /ext-src
+#COPY --from=rum-pg-build /rum.tar.gz /ext-src
 #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
 COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
 COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -946,7 +945,7 @@ COPY patches/pg_hintplan.patch /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
-#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
+COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
 COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
 COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
 COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
@@ -961,7 +960,6 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
    rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
    || exit 1; rm -f $f; done
 RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
-RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
 # cmake is required for the h3 test
 RUN apt-get update && apt-get install -y cmake
 RUN patch -p1 < /ext-src/pg_hintplan.patch
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -4,11 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-default = []
-# Enables test specific features.
-testing = []
-
 [dependencies]
 anyhow.workspace = true
 async-compression.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -400,15 +400,7 @@ impl ComputeNode {
    pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let mut retry_period_ms = 500.0;
        let mut attempts = 0;
-        const DEFAULT_ATTEMPTS: u16 = 10;
-        #[cfg(feature = "testing")]
-        let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
-            u16::from_str(&v).unwrap()
-        } else {
-            DEFAULT_ATTEMPTS
-        };
-        #[cfg(not(feature = "testing"))]
-        let max_attempts = DEFAULT_ATTEMPTS;
+        let max_attempts = 10;
        loop {
            let result = self.try_get_basebackup(compute_state, lsn);
            match result {
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {

 fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
    for (var, val) in std::env::vars() {
-        if var.starts_with("NEON_") {
+        if var.starts_with("NEON_PAGESERVER_") {
            cmd = cmd.env(var, val);
        }
    }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -514,6 +514,7 @@ impl LocalEnv {
                #[derive(serde::Serialize, serde::Deserialize)]
                // (allow unknown fields, unlike PageServerConf)
                struct PageserverConfigTomlSubset {
+                    id: NodeId,
                    listen_pg_addr: String,
                    listen_http_addr: String,
                    pg_auth_type: AuthType,
@@ -525,30 +526,18 @@ impl LocalEnv {
                        .with_context(|| format!("read {:?}", config_toml_path))?,
                )
                .context("parse pageserver.toml")?;
-                let identity_toml_path = dentry.path().join("identity.toml");
-                #[derive(serde::Serialize, serde::Deserialize)]
-                struct IdentityTomlSubset {
-                    id: NodeId,
-                }
-                let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
-                    &std::fs::read_to_string(&identity_toml_path)
-                        .with_context(|| format!("read {:?}", identity_toml_path))?,
-                )
-                .context("parse identity.toml")?;
                let PageserverConfigTomlSubset {
+                    id: config_toml_id,
                    listen_pg_addr,
                    listen_http_addr,
                    pg_auth_type,
                    http_auth_type,
                } = config_toml;
-                let IdentityTomlSubset {
-                    id: identity_toml_id,
-                } = identity_toml;
                let conf = PageServerConf {
                    id: {
                        anyhow::ensure!(
-                            identity_toml_id == id,
-                            "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
+                            config_toml_id == id,
+                            "id mismatch: config_toml.id={config_toml_id} id={id}",
                        );
                        id
                    },
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -127,13 +127,10 @@ impl PageServerNode {
        }

        // Apply the user-provided overrides
-        overrides.push({
-            let mut doc =
-                toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
-            // `id` is written out to `identity.toml` instead of `pageserver.toml`
-            doc.remove("id").expect("it's part of the struct");
-            doc.to_string()
-        });
+        overrides.push(
+            toml_edit::ser::to_string_pretty(&conf)
+                .expect("we deserialized this from toml earlier"),
+        );

        // Turn `overrides` into a toml document.
        // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
        rm -rf $TMPDIR
        # We are running tests now
-        if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
+        if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
        then
            cleanup
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,15 +1,15 @@
 #!/bin/bash
 set -x

-cd /ext-src || exit 2
+cd /ext-src
 FAILED=
-LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
+LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
 for d in ${LIST}
 do
-       [ -d "${d}" ] || continue
+       [ -d ${d} ] || continue
    psql -c "select 1" >/dev/null || break
-       USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
+       make -C ${d} installcheck || FAILED="${d} ${FAILED}"
 done
 [ -z "${FAILED}" ] && exit 0
-echo "${FAILED}"
+echo ${FAILED}
 exit 1
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,6 +1,5 @@
-use std::collections::HashSet;
 use std::str::FromStr;
-use std::time::{Duration, Instant};
+use std::time::Instant;

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
@@ -295,42 +294,6 @@ pub enum PlacementPolicy {
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}

-/// Metadata health record posted from scrubber.
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthRecord {
-    pub tenant_shard_id: TenantShardId,
-    pub healthy: bool,
-    pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthUpdateRequest {
-    pub healthy_tenant_shards: HashSet<TenantShardId>,
-    pub unhealthy_tenant_shards: HashSet<TenantShardId>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthUpdateResponse {}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListUnhealthyResponse {
-    pub unhealthy_tenant_shards: Vec<TenantShardId>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListOutdatedRequest {
-    #[serde(with = "humantime_serde")]
-    pub not_scrubbed_for: Duration,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListOutdatedResponse {
-    pub health_records: Vec<MetadataHealthRecord>,
-}
-
 #[cfg(test)]
 mod test {
    use super::*;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -637,13 +637,6 @@ pub struct TenantInfo {
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
    pub attachment_status: TenantAttachmentStatus,
    pub generation: u32,
-
-    /// Opaque explanation if gc is being blocked.
-    ///
-    /// Only looked up for the individual tenant detail, not the listing. This is purely for
-    /// debugging, not included in openapi.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub gc_blocking: Option<String>,
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -1434,7 +1427,6 @@ mod tests {
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
-            gc_blocking: None,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
@@ -1457,7 +1449,6 @@ mod tests {
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
-            gc_blocking: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -355,8 +355,7 @@ impl RemoteStorage for AzureBlobStorage {
                    .blobs()
                    .map(|k| ListingObject{
                        key: self.name_to_relative_path(&k.name),
-                        last_modified: k.properties.last_modified.into(),
-                        size: k.properties.content_length,
+                        last_modified: k.properties.last_modified.into()
                    }
                    );

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -144,7 +144,6 @@ impl RemotePath {
 ///
 /// The WithDelimiter mode will populate `prefixes` and `keys` in the result.  The
 /// NoDelimiter mode will only populate `keys`.
-#[derive(Copy, Clone)]
 pub enum ListingMode {
    WithDelimiter,
    NoDelimiter,
@@ -154,7 +153,6 @@ pub enum ListingMode {
 pub struct ListingObject {
    pub key: RemotePath,
    pub last_modified: SystemTime,
-    pub size: u64,
 }

 #[derive(Default)]
@@ -196,7 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;
+    ) -> impl Stream<Item = Result<Listing, DownloadError>>;

    async fn list(
        &self,
@@ -353,10 +351,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &'a CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
        match self {
            Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
-                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
+                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
            Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -368,7 +368,6 @@ impl RemoteStorage for LocalFs {
                            key: k.clone(),
                            // LocalFs is just for testing, so just specify a dummy time
                            last_modified: SystemTime::now(),
-                            size: 0,
                        })
                    }
                })
@@ -412,7 +411,6 @@ impl RemoteStorage for LocalFs {
                            key: RemotePath::from_string(&relative_key).unwrap(),
                            // LocalFs is just for testing
                            last_modified: SystemTime::now(),
-                            size: 0,
                        });
                    }
                }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -565,12 +565,9 @@ impl RemoteStorage for S3Bucket {
                        }
                    };

-                    let size = object.size.unwrap_or(0) as u64;
-
                    result.keys.push(ListingObject{
                        key,
-                        last_modified,
-                        size,
+                        last_modified
                    });
                    if let Some(mut mk) = max_keys {
                        assert!(mk > 0);
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
        async_stream::stream! {
            self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
                .map_err(DownloadError::Other)?;
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
 #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
-    /// Provides access to all data for a specific tenant (specified in `struct Claims` below)
+    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
    // TODO: join these two?
    Tenant,
-    /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
-    /// Should only be used e.g. for status check/tenant creation/list.
+    // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
+    // Should only be used e.g. for status check/tenant creation/list.
    PageServerApi,
-    /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
-    /// Should only be used e.g. for status check.
-    /// Currently also used for connection from any pageserver to any safekeeper.
+    // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
+    // Should only be used e.g. for status check.
+    // Currently also used for connection from any pageserver to any safekeeper.
    SafekeeperData,
-    /// The scope used by pageservers in upcalls to storage controller and cloud control plane
+    // The scope used by pageservers in upcalls to storage controller and cloud control plane
    #[serde(rename = "generations_api")]
    GenerationsApi,
-    /// Allows access to control plane managment API and some storage controller endpoints.
+    // Allows access to control plane management API and some storage controller endpoints.
    Admin,

    /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -49,7 +49,6 @@ postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 rand.workspace = true
-range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
@@ -108,7 +107,3 @@ harness = false
 [[bench]]
 name = "bench_walredo"
 harness = false
-
-[[bench]]
-name = "bench_ingest"
-harness = false
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -1,235 +0,0 @@
-use std::{env, num::NonZeroUsize};
-
-use bytes::Bytes;
-use camino::Utf8PathBuf;
-use criterion::{criterion_group, criterion_main, Criterion};
-use pageserver::{
-    config::PageServerConf,
-    context::{DownloadBehavior, RequestContext},
-    l0_flush::{L0FlushConfig, L0FlushGlobalState},
-    page_cache,
-    repository::Value,
-    task_mgr::TaskKind,
-    tenant::storage_layer::InMemoryLayer,
-    virtual_file::{self, api::IoEngineKind},
-};
-use pageserver_api::{key::Key, shard::TenantShardId};
-use utils::{
-    bin_ser::BeSer,
-    id::{TenantId, TimelineId},
-};
-
-// A very cheap hash for generating non-sequential keys.
-fn murmurhash32(mut h: u32) -> u32 {
-    h ^= h >> 16;
-    h = h.wrapping_mul(0x85ebca6b);
-    h ^= h >> 13;
-    h = h.wrapping_mul(0xc2b2ae35);
-    h ^= h >> 16;
-    h
-}
-
-enum KeyLayout {
-    /// Sequential unique keys
-    Sequential,
-    /// Random unique keys
-    Random,
-    /// Random keys, but only use the bits from the mask of them
-    RandomReuse(u32),
-}
-
-enum WriteDelta {
-    Yes,
-    No,
-}
-
-async fn ingest(
-    conf: &'static PageServerConf,
-    put_size: usize,
-    put_count: usize,
-    key_layout: KeyLayout,
-    write_delta: WriteDelta,
-) -> anyhow::Result<()> {
-    let mut lsn = utils::lsn::Lsn(1000);
-    let mut key = Key::from_i128(0x0);
-
-    let timeline_id = TimelineId::generate();
-    let tenant_id = TenantId::generate();
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
-
-    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-
-    let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &ctx).await?;
-
-    let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
-    let ctx = RequestContext::new(
-        pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
-        pageserver::context::DownloadBehavior::Download,
-    );
-
-    for i in 0..put_count {
-        lsn += put_size as u64;
-
-        // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
-        // usually care the most about write performance when they're blasting a huge batch of data into a huge table.
-        match key_layout {
-            KeyLayout::Sequential => {
-                // Use sequential order to illustrate the experience a user is likely to have
-                // when ingesting bulk data.
-                key.field6 = i as u32;
-            }
-            KeyLayout::Random => {
-                // Use random-order keys to avoid giving a false advantage to data structures that are
-                // faster when inserting on the end.
-                key.field6 = murmurhash32(i as u32);
-            }
-            KeyLayout::RandomReuse(mask) => {
-                // Use low bits only, to limit cardinality
-                key.field6 = murmurhash32(i as u32) & mask;
-            }
-        }
-
-        layer.put_value(key, lsn, &data, &ctx).await?;
-    }
-    layer.freeze(lsn + 1).await;
-
-    if matches!(write_delta, WriteDelta::Yes) {
-        let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
-            max_concurrency: NonZeroUsize::new(1).unwrap(),
-        });
-        let (_desc, path) = layer
-            .write_to_disk(&ctx, None, l0_flush_state.inner())
-            .await?
-            .unwrap();
-        tokio::fs::remove_file(path).await?;
-    }
-
-    Ok(())
-}
-
-/// Wrapper to instantiate a tokio runtime
-fn ingest_main(
-    conf: &'static PageServerConf,
-    put_size: usize,
-    put_count: usize,
-    key_layout: KeyLayout,
-    write_delta: WriteDelta,
-) {
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .unwrap();
-
-    runtime.block_on(async move {
-        let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
-        if let Err(e) = r {
-            panic!("{e:?}");
-        }
-    });
-}
-
-/// Declare a series of benchmarks for the Pageserver's ingest write path.
-///
-/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either
-/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
-///
-/// Genuine disk I/O is used, so expect results to differ depending on storage.  However, when running on
-/// a fast disk, CPU is the bottleneck at time of writing.
-fn criterion_benchmark(c: &mut Criterion) {
-    let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
-    let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap();
-    eprintln!("Data directory: {}", temp_dir.path());
-
-    let conf: &'static PageServerConf = Box::leak(Box::new(
-        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
-    ));
-    virtual_file::init(16384, IoEngineKind::TokioEpollUring);
-    page_cache::init(conf.page_cache_size);
-
-    {
-        let mut group = c.benchmark_group("ingest-small-values");
-        let put_size = 100usize;
-        let put_count = 128 * 1024 * 1024 / put_size;
-        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
-        group.sample_size(10);
-        group.bench_function("ingest 128MB/100b seq", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b rand", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Random,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::RandomReuse(0x3ff),
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b seq, no delta", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::No,
-                )
-            })
-        });
-    }
-
-    {
-        let mut group = c.benchmark_group("ingest-big-values");
-        let put_size = 8192usize;
-        let put_count = 128 * 1024 * 1024 / put_size;
-        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
-        group.sample_size(10);
-        group.bench_function("ingest 128MB/8k seq", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/8k seq, no delta", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::No,
-                )
-            })
-        });
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
-criterion_main!(benches);
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,4 +1,3 @@
-use criterion::measurement::WallTime;
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
@@ -16,11 +15,7 @@ use utils::id::{TenantId, TimelineId};

 use utils::lsn::Lsn;

-use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
-
-fn fixture_path(relative: &str) -> PathBuf {
-    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
-}
+use criterion::{black_box, criterion_group, criterion_main, Criterion};

 fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
    let mut layer_map = LayerMap::default();
@@ -114,7 +109,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
 // between each test run.
 fn bench_from_captest_env(c: &mut Criterion) {
    // TODO consider compressing this file
-    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

    // Test with uniform query pattern
@@ -144,7 +139,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
 fn bench_from_real_project(c: &mut Criterion) {
    // Init layer map
    let now = Instant::now();
-    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
    println!("Finished layer map init in {:?}", now.elapsed());

    // Choose uniformly distributed queries
@@ -247,72 +242,7 @@ fn bench_sequential(c: &mut Criterion) {
    group.finish();
 }

-fn bench_visibility_with_map(
-    group: &mut BenchmarkGroup<WallTime>,
-    layer_map: LayerMap,
-    read_points: Vec<Lsn>,
-    bench_name: &str,
-) {
-    group.bench_function(bench_name, |b| {
-        b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
-    });
-}
-
-// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
-fn bench_visibility(c: &mut Criterion) {
-    let mut group = c.benchmark_group("visibility");
-    {
-        // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
-        let now = Instant::now();
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-        for i in 0..100_000 {
-            let i32 = (i as u32) % 100;
-            let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-            let layer = PersistentLayerDesc::new_img(
-                TenantShardId::unsharded(TenantId::generate()),
-                TimelineId::generate(),
-                zero.add(10 * i32)..zero.add(10 * i32 + 1),
-                Lsn(i),
-                0,
-            );
-            updates.insert_historic(layer);
-        }
-        updates.flush();
-        println!("Finished layer map init in {:?}", now.elapsed());
-
-        let mut read_points = Vec::new();
-        for i in (0..100_000).step_by(1000) {
-            read_points.push(Lsn(i));
-        }
-
-        bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
-    }
-
-    {
-        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
-        let read_points = vec![Lsn(0x1C760FA190)];
-        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
-
-        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
-        let read_points = vec![
-            Lsn(0x1C760FA190),
-            Lsn(0x000000931BEAD539),
-            Lsn(0x000000931BF63011),
-            Lsn(0x000000931B33AE68),
-            Lsn(0x00000038E67ABFA0),
-            Lsn(0x000000931B33AE68),
-            Lsn(0x000000914E3F38F0),
-            Lsn(0x000000931B33AE68),
-        ];
-        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
-    }
-
-    group.finish();
-}
-
 criterion_group!(group_1, bench_from_captest_env);
 criterion_group!(group_2, bench_from_real_project);
 criterion_group!(group_3, bench_sequential);
-criterion_group!(group_4, bench_visibility);
-criterion_main!(group_1, group_2, group_3, group_4);
+criterion_main!(group_1, group_2, group_3);
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -17,9 +17,11 @@ use pageserver::config::PageserverIdentity;
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
-use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
+use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
-use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
+use pageserver::{
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
+};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
@@ -29,9 +31,11 @@ use tracing::*;
 use metrics::set_build_info_metric;
 use pageserver::{
    config::PageServerConf,
+    context::{DownloadBehavior, RequestContext},
    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
-    task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
+    task_mgr::TaskKind,
+    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
    tenant::mgr,
    virtual_file,
 };
@@ -589,13 +593,30 @@ fn start_pageserver(

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
-        let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
-        pageserver_listener
-            .set_nonblocking(true)
-            .context("set listener to nonblocking")?;
-        tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
-    });
+    let libpq_listener = {
+        let cancel = CancellationToken::new();
+        let libpq_ctx = RequestContext::todo_child(
+            TaskKind::LibpqEndpointListener,
+            // The listener task shouldn't need to download anything. (We will
+            // create a separate sub-context for each connection, each with its
+            // own download behavior. This context is used only to listen and
+            // accept connections.)
+            DownloadBehavior::Error,
+        );
+
+        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+            "libpq listener",
+            page_service::libpq_listener_main(
+                tenant_manager.clone(),
+                pg_auth,
+                pageserver_listener,
+                conf.pg_auth_type,
+                libpq_ctx,
+                cancel.clone(),
+            ),
+        ));
+        LibpqEndpointListener(CancellableTask { task, cancel })
+    };

    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

@@ -623,7 +644,7 @@ fn start_pageserver(
            shutdown_pageserver.take();
            pageserver::shutdown_pageserver(
                http_endpoint_listener,
-                page_service,
+                libpq_listener,
                consumption_metrics_tasks,
                disk_usage_eviction_task,
                &tenant_manager,
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -356,6 +356,8 @@ struct PageServerConfigBuilder {
    auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,

+    id: BuilderValue<NodeId>,
+
    broker_endpoint: BuilderValue<Uri>,
    broker_keepalive_interval: BuilderValue<Duration>,

@@ -404,8 +406,11 @@ struct PageServerConfigBuilder {
 }

 impl PageServerConfigBuilder {
-    fn new() -> Self {
-        Self::default()
+    fn new(node_id: NodeId) -> Self {
+        let mut this = Self::default();
+        this.id(node_id);
+
+        this
    }

    #[inline(always)]
@@ -433,6 +438,7 @@ impl PageServerConfigBuilder {
            pg_auth_type: Set(AuthType::Trust),
            auth_validation_public_key_path: Set(None),
            remote_storage_config: Set(None),
+            id: NotSet,
            broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                .parse()
                .expect("failed to parse default broker endpoint")),
@@ -562,6 +568,10 @@ impl PageServerConfigBuilder {
        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
    }

+    pub fn id(&mut self, node_id: NodeId) {
+        self.id = BuilderValue::Set(node_id)
+    }
+
    pub fn log_format(&mut self, log_format: LogFormat) {
        self.log_format = BuilderValue::Set(log_format)
    }
@@ -673,7 +683,7 @@ impl PageServerConfigBuilder {
        self.l0_flush = BuilderValue::Set(value);
    }

-    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
+    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

        macro_rules! conf {
@@ -706,6 +716,7 @@ impl PageServerConfigBuilder {
                pg_auth_type,
                auth_validation_public_key_path,
                remote_storage_config,
+                id,
                broker_endpoint,
                broker_keepalive_interval,
                log_format,
@@ -733,7 +744,6 @@ impl PageServerConfigBuilder {
            }
            CUSTOM LOGIC
            {
-                id: id,
                // TenantConf is handled separately
                default_tenant_conf: TenantConf::default(),
                concurrent_tenant_warmup: ConfigurableSemaphore::new({
@@ -883,7 +893,7 @@ impl PageServerConf {
        toml: &Document,
        workdir: &Utf8Path,
    ) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::new();
+        let mut builder = PageServerConfigBuilder::new(node_id);
        builder.workdir(workdir.to_owned());

        let mut t_conf = TenantConfOpt::default();
@@ -914,6 +924,8 @@ impl PageServerConf {
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
+                "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
+                            // Logging is not set up yet, so we can't log that the field is ignored.
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                "log_format" => builder.log_format(
@@ -1006,7 +1018,7 @@ impl PageServerConf {
            }
        }

-        let mut conf = builder.build(node_id).context("invalid config")?;
+        let mut conf = builder.build().context("invalid config")?;

        if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
            let auth_validation_public_key_path = conf
@@ -1243,6 +1255,7 @@ max_file_descriptors = 333

 # initial superuser role name to use when creating a new tenant
 initial_superuser_name = 'zzzz'
+id = 10

 metric_collection_interval = '222 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
@@ -1259,8 +1272,9 @@ background_task_maximum_delay = '334 s'
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
        // we have to create dummy values to overcome the validation errors
-        let config_string =
-            format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
+        let config_string = format!(
+            "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
+        );
        let toml = config_string.parse()?;

        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
@@ -1565,6 +1579,7 @@ broker_endpoint = '{broker_endpoint}'
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
+id = 222

 [disk_usage_based_eviction]
 max_usage_pct = 80
@@ -1634,6 +1649,7 @@ threshold = "20m"
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
+id = 222

 [tenant_config]
 evictions_low_residence_duration_metric_threshold = "20m"
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -308,45 +308,6 @@ paths:
            application/json:
              schema:
                type: string
-
-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Persistently add a gc blocking at the tenant level because of this timeline
-      responses:
-        "200":
-          description: OK
-
-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Persistently remove a tenant level gc blocking for this timeline
-      responses:
-        "200":
-          description: OK
-
  /v1/tenant/{tenant_shard_id}/location_config:
    parameters:
      - name: tenant_shard_id
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -296,11 +296,6 @@ impl From<GetActiveTenantError> for ApiError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => {
                ApiError::ResourceUnavailable(format!("{}", e).into())
            }
-            GetActiveTenantError::SwitchedTenant => {
-                // in our HTTP handlers, this error doesn't happen
-                // TODO: separate error types
-                ApiError::ResourceUnavailable("switched tenant".into())
-            }
        }
    }
 }
@@ -935,7 +930,6 @@ async fn tenant_list_handler(
            generation: (*gen)
                .into()
                .expect("Tenants are always attached with a generation"),
-            gc_blocking: None,
        })
        .collect::<Vec<TenantInfo>>();

@@ -987,7 +981,6 @@ async fn tenant_status(
                    .generation()
                    .into()
                    .expect("Tenants are always attached with a generation"),
-                gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
            },
            walredo: tenant.wal_redo_manager_status(),
            timelines: tenant.list_timeline_ids(),
@@ -1228,72 +1221,6 @@ async fn evict_timeline_layer_handler(
    }
 }

-async fn timeline_gc_blocking_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    block_or_unblock_gc(request, true).await
-}
-
-async fn timeline_gc_unblocking_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    block_or_unblock_gc(request, false).await
-}
-
-/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
-///
-/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
-async fn block_or_unblock_gc(
-    request: Request<Body>,
-    block: bool,
-) -> Result<Response<Body>, ApiError> {
-    use crate::tenant::{
-        remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
-    };
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let state = get_state(&request);
-
-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-    let timeline = tenant.get_timeline(timeline_id, true)?;
-
-    let fut = async {
-        if block {
-            timeline.block_gc(&tenant).await.map(|_| ())
-        } else {
-            timeline.unblock_gc(&tenant).await
-        }
-    };
-
-    let span = tracing::info_span!(
-        "block_or_unblock_gc",
-        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
-        timeline_id = %timeline_id,
-        block = block,
-    );
-
-    let res = fut.instrument(span).await;
-
-    res.map_err(|e| {
-        if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
-            ApiError::ShuttingDown
-        } else {
-            ApiError::InternalServerError(e)
-        }
-    })?;
-
-    json_response(StatusCode::OK, ())
-}
-
 /// Get tenant_size SVG graph along with the JSON data.
 fn synthetic_size_html_response(
    inputs: ModelInputs,
@@ -2202,24 +2129,14 @@ async fn secondary_download_handler(

    let timeout = wait.unwrap_or(Duration::MAX);

-    let result = tokio::time::timeout(
+    let status = match tokio::time::timeout(
        timeout,
        state.secondary_controller.download_tenant(tenant_shard_id),
    )
-    .await;
-
-    let progress = secondary_tenant.progress.lock().unwrap().clone();
-
-    let status = match result {
-        Ok(Ok(())) => {
-            if progress.layers_downloaded >= progress.layers_total {
-                // Download job ran to completion
-                StatusCode::OK
-            } else {
-                // Download dropped out without errors because it ran out of time budget
-                StatusCode::ACCEPTED
-            }
-        }
+    .await
+    {
+        // Download job ran to completion.
+        Ok(Ok(())) => StatusCode::OK,
        // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
        // okay.  We could get an error here in the unlikely edge case that the tenant
        // was detached between our check above and executing the download job.
@@ -2229,6 +2146,8 @@ async fn secondary_download_handler(
        Err(_) => StatusCode::ACCEPTED,
    };

+    let progress = secondary_tenant.progress.lock().unwrap().clone();
+
    json_response(status, progress)
 }

@@ -2972,14 +2891,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, evict_timeline_layer_handler),
        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
-            |r| api_handler(r, timeline_gc_blocking_handler),
-        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
-            |r| api_handler(r, timeline_gc_unblocking_handler),
-        )
        .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
            api_handler(r, secondary_upload_handler)
        })
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -2,29 +2,19 @@ use std::{num::NonZeroUsize, sync::Arc};

 use crate::tenant::ephemeral_file;

-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
+#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
+    #[default]
    PageCached,
    #[serde(rename_all = "snake_case")]
-    Direct {
-        max_concurrency: NonZeroUsize,
-    },
-}
-
-impl Default for L0FlushConfig {
-    fn default() -> Self {
-        Self::Direct {
-            // TODO: using num_cpus results in different peak memory usage on different instance types.
-            max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
-        }
-    }
+    Direct { max_concurrency: NonZeroUsize },
 }

 #[derive(Clone)]
 pub struct L0FlushGlobalState(Arc<Inner>);

-pub enum Inner {
+pub(crate) enum Inner {
    PageCached,
    Direct { semaphore: tokio::sync::Semaphore },
 }
@@ -40,7 +30,7 @@ impl L0FlushGlobalState {
        }
    }

-    pub fn inner(&self) -> &Arc<Inner> {
+    pub(crate) fn inner(&self) -> &Arc<Inner> {
        &self.0
    }
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -12,8 +12,6 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
-
-use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
 pub mod aux_file;
@@ -32,13 +30,14 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
 use tenant::{
    mgr::{BackgroundPurges, TenantManager},
    secondary,
 };
-use tracing::{info, info_span};
+use tracing::info;

 /// Current storage format version
 ///
@@ -64,6 +63,7 @@ pub struct CancellableTask {
    pub cancel: CancellationToken,
 }
 pub struct HttpEndpointListener(pub CancellableTask);
+pub struct LibpqEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
 impl CancellableTask {
@@ -77,7 +77,7 @@ impl CancellableTask {
 #[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
    http_listener: HttpEndpointListener,
-    page_service: page_service::Listener,
+    libpq_listener: LibpqEndpointListener,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
@@ -87,83 +87,10 @@ pub async fn shutdown_pageserver(
    exit_code: i32,
 ) {
    use std::time::Duration;
-
-    // If the orderly shutdown below takes too long, we still want to make
-    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
-    //
-    // (Leftover walredo processes are the hypothesized trigger for the systemd freezes
-    //  that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
-    //
-    // We use a thread instead of a tokio task because the background runtime is likely busy
-    // with the final flushing / uploads. This activity here has priority, and due to lack
-    // of scheduling priority feature sin the tokio scheduler, using a separate thread is
-    // an effective priority booster.
-    let walredo_extraordinary_shutdown_thread_span = {
-        let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
-        span.follows_from(tracing::Span::current());
-        span
-    };
-    let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
-    let walredo_extraordinary_shutdown_thread = std::thread::spawn({
-        let walredo_extraordinary_shutdown_thread_cancel =
-            walredo_extraordinary_shutdown_thread_cancel.clone();
-        move || {
-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .unwrap();
-            let _entered = rt.enter();
-            let _entered = walredo_extraordinary_shutdown_thread_span.enter();
-            if let Ok(()) = rt.block_on(tokio::time::timeout(
-                Duration::from_secs(8),
-                walredo_extraordinary_shutdown_thread_cancel.cancelled(),
-            )) {
-                info!("cancellation requested");
-                return;
-            }
-            let managers = tenant::WALREDO_MANAGERS
-                .lock()
-                .unwrap()
-                // prevents new walredo managers from being inserted
-                .take()
-                .expect("only we take()");
-            // Use FuturesUnordered to get in queue early for each manager's
-            // heavier_once_cell semaphore wait list.
-            // Also, for idle tenants that for some reason haven't
-            // shut down yet, it's quite likely that we're not going
-            // to get Poll::Pending once.
-            let mut futs: FuturesUnordered<_> = managers
-                .into_iter()
-                .filter_map(|(_, mgr)| mgr.upgrade())
-                .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
-                .collect();
-            info!(count=%futs.len(), "built FuturesUnordered");
-            let mut last_log_at = std::time::Instant::now();
-            #[derive(Debug, Default)]
-            struct Results {
-                initiated: u64,
-                already: u64,
-            }
-            let mut results = Results::default();
-            while let Some(we_initiated) = rt.block_on(futs.next()) {
-                if we_initiated {
-                    results.initiated += 1;
-                } else {
-                    results.already += 1;
-                }
-                if last_log_at.elapsed() > Duration::from_millis(100) {
-                    info!(remaining=%futs.len(), ?results, "progress");
-                    last_log_at = std::time::Instant::now();
-                }
-            }
-            info!(?results, "done");
-        }
-    });
-
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    let remaining_connections = timed(
-        page_service.stop_accepting(),
+    timed(
+        libpq_listener.0.shutdown(),
        "shutdown LibpqEndpointListener",
        Duration::from_secs(1),
    )
@@ -181,7 +108,7 @@ pub async fn shutdown_pageserver(
    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
    // should already have been canclled via mgr::shutdown_all_tenants
    timed(
-        remaining_connections.shutdown(),
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
        "shutdown PageRequestHandlers",
        Duration::from_secs(1),
    )
@@ -235,12 +162,6 @@ pub async fn shutdown_pageserver(
        Duration::from_secs(1),
    )
    .await;
-
-    info!("cancel & join walredo_extraordinary_shutdown_thread");
-    walredo_extraordinary_shutdown_thread_cancel.cancel();
-    walredo_extraordinary_shutdown_thread.join().unwrap();
-    info!("walredo_extraordinary_shutdown_thread done");
-
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -525,15 +525,6 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_visible_physical_size",
-        "The size of the layer files present in the pageserver's filesystem.",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
        "pageserver_resident_physical_size_global",
@@ -622,23 +613,7 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_compression_image_in_bytes_total",
-        "Size of data written into image layers before compression"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_in_bytes_considered",
-        "Size of potentially compressible data written into image layers before compression"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_in_bytes_chosen",
-        "Size of data whose compressed form was written into image layers"
+        "Size of uncompressed data written into image layers"
    )
    .expect("failed to define a metric")
 });
@@ -2213,7 +2188,6 @@ pub(crate) struct TimelineMetrics {
    pub(crate) layer_count_delta: UIntGauge,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
-    pub visible_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub aux_file_size_gauge: IntGauge,
@@ -2336,9 +2310,6 @@ impl TimelineMetrics {
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
-        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        // TODO: we shouldn't expose this metric
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2393,7 +2364,6 @@ impl TimelineMetrics {
            layer_count_delta,
            standby_horizon_gauge,
            resident_physical_size_gauge,
-            visible_physical_size_gauge,
            current_logical_size_gauge,
            aux_file_size_gauge,
            directory_entries_count_gauge,
@@ -2445,7 +2415,6 @@ impl TimelineMetrics {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
-        let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -33,7 +33,6 @@ use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
 use std::collections::BTreeMap;
 use std::fmt;
-use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
@@ -148,7 +147,6 @@ pub(crate) mod timeline;

 pub mod size;

-mod gc_block;
 pub(crate) mod throttle;

 pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -304,12 +302,6 @@ pub struct Tenant {
    /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
    ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,

-    /// `index_part.json` based gc blocking reason tracking.
-    ///
-    /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
-    /// proceeding.
-    pub(crate) gc_block: gc_block::GcBlock,
-
    l0_flush_global_state: L0FlushGlobalState,
 }

@@ -320,66 +312,14 @@ impl std::fmt::Debug for Tenant {
 }

 pub(crate) enum WalRedoManager {
-    Prod(WalredoManagerId, PostgresRedoManager),
+    Prod(PostgresRedoManager),
    #[cfg(test)]
    Test(harness::TestRedoManager),
 }

-#[derive(thiserror::Error, Debug)]
-#[error("pageserver is shutting down")]
-pub(crate) struct GlobalShutDown;
-
-impl WalRedoManager {
-    pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
-        let id = WalredoManagerId::next();
-        let arc = Arc::new(Self::Prod(id, mgr));
-        let mut guard = WALREDO_MANAGERS.lock().unwrap();
-        match &mut *guard {
-            Some(map) => {
-                map.insert(id, Arc::downgrade(&arc));
-                Ok(arc)
-            }
-            None => Err(GlobalShutDown),
-        }
-    }
-}
-
-impl Drop for WalRedoManager {
-    fn drop(&mut self) {
-        match self {
-            Self::Prod(id, _) => {
-                let mut guard = WALREDO_MANAGERS.lock().unwrap();
-                if let Some(map) = &mut *guard {
-                    map.remove(id).expect("new() registers, drop() unregisters");
-                }
-            }
-            #[cfg(test)]
-            Self::Test(_) => {
-                // Not applicable to test redo manager
-            }
-        }
-    }
-}
-
-/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
-/// the walredo processes outside of the regular order.
-///
-/// This is necessary to work around a systemd bug where it freezes if there are
-/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
-#[allow(clippy::type_complexity)]
-pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
-    Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
-> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
-#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
-pub(crate) struct WalredoManagerId(u64);
-impl WalredoManagerId {
-    pub fn next() -> Self {
-        static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
-        let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        if id == 0 {
-            panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
-        }
-        Self(id)
+impl From<PostgresRedoManager> for WalRedoManager {
+    fn from(mgr: PostgresRedoManager) -> Self {
+        Self::Prod(mgr)
    }
 }

@@ -391,20 +331,19 @@ impl From<harness::TestRedoManager> for WalRedoManager {
 }

 impl WalRedoManager {
-    pub(crate) async fn shutdown(&self) -> bool {
+    pub(crate) async fn shutdown(&self) {
        match self {
-            Self::Prod(_, mgr) => mgr.shutdown().await,
+            Self::Prod(mgr) => mgr.shutdown().await,
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
-                true
            }
        }
    }

    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
        match self {
-            Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
+            Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
@@ -424,7 +363,7 @@ impl WalRedoManager {
        pg_version: u32,
    ) -> Result<bytes::Bytes, walredo::Error> {
        match self {
-            Self::Prod(_, mgr) => {
+            Self::Prod(mgr) => {
                mgr.request_redo(key, lsn, base_img, records, pg_version)
                    .await
            }
@@ -438,7 +377,7 @@ impl WalRedoManager {

    pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
        match self {
-            WalRedoManager::Prod(_, m) => Some(m.status()),
+            WalRedoManager::Prod(m) => Some(m.status()),
            #[cfg(test)]
            WalRedoManager::Test(_) => None,
        }
@@ -447,8 +386,6 @@ impl WalRedoManager {

 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 pub enum GetTimelineError {
-    #[error("Timeline is shutting down")]
-    ShuttingDown,
    #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
    NotActive {
        tenant_id: TenantShardId,
@@ -738,9 +675,11 @@ impl Tenant {
        init_order: Option<InitializationOrder>,
        mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<Arc<Tenant>, GlobalShutDown> {
-        let wal_redo_manager =
-            WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
+    ) -> Arc<Tenant> {
+        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
+            conf,
+            tenant_shard_id,
+        )));

        let TenantSharedResources {
            broker_client,
@@ -939,7 +878,7 @@ impl Tenant {
            }
            .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
        );
-        Ok(tenant)
+        tenant
    }

    #[instrument(skip_all)]
@@ -1043,8 +982,6 @@ impl Tenant {
            }
        }

-        let mut gc_blocks = HashMap::new();
-
        // For every timeline, download the metadata file, scan the local directory,
        // and build a layer map that contains an entry for each remote and local
        // layer file.
@@ -1054,16 +991,6 @@ impl Tenant {
                .remove(&timeline_id)
                .expect("just put it in above");

-            if let Some(blocking) = index_part.gc_blocking.as_ref() {
-                // could just filter these away, but it helps while testing
-                anyhow::ensure!(
-                    !blocking.reasons.is_empty(),
-                    "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
-                );
-                let prev = gc_blocks.insert(timeline_id, blocking.reasons);
-                assert!(prev.is_none());
-            }
-
            // TODO again handle early failure
            self.load_remote_timeline(
                timeline_id,
@@ -1108,8 +1035,6 @@ impl Tenant {
        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

-        self.gc_block.set_scanned(gc_blocks);
-
        fail::fail_point!("attach-before-activate", |_| {
            anyhow::bail!("attach-before-activate");
        });
@@ -1655,7 +1580,7 @@ impl Tenant {
        self: Arc<Self>,
        timeline_id: TimelineId,
    ) -> Result<(), DeleteTimelineError> {
-        DeleteTimelineFlow::run(&self, timeline_id).await?;
+        DeleteTimelineFlow::run(&self, timeline_id, false).await?;

        Ok(())
    }
@@ -1700,14 +1625,6 @@ impl Tenant {
            }
        }

-        let _guard = match self.gc_block.start().await {
-            Ok(guard) => guard,
-            Err(reasons) => {
-                info!("Skipping GC: {reasons}");
-                return Ok(GcResult::default());
-            }
-        };
-
        self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
            .await
    }
@@ -2720,7 +2637,6 @@ impl Tenant {
            )),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
            ongoing_timeline_detach: std::sync::Mutex::default(),
-            gc_block: Default::default(),
            l0_flush_global_state,
        }
    }
@@ -4122,7 +4038,7 @@ pub(crate) mod harness {

 #[cfg(test)]
 mod tests {
-    use std::collections::{BTreeMap, BTreeSet};
+    use std::collections::BTreeMap;

    use super::*;
    use crate::keyspace::KeySpaceAccum;
@@ -4797,7 +4713,7 @@ mod tests {
        lsn: Lsn,
        repeat: usize,
        key_count: usize,
-    ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
+    ) -> anyhow::Result<()> {
        let compact = true;
        bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
    }
@@ -4810,9 +4726,7 @@ mod tests {
        repeat: usize,
        key_count: usize,
        compact: bool,
-    ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
-        let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
-
+    ) -> anyhow::Result<()> {
        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let mut blknum = 0;

@@ -4833,7 +4747,6 @@ mod tests {
                        ctx,
                    )
                    .await?;
-                inserted.entry(test_key).or_default().insert(lsn);
                writer.finish_write(lsn);
                drop(writer);

@@ -4858,7 +4771,7 @@ mod tests {
            assert_eq!(res.layers_removed, 0, "this never removes anything");
        }

-        Ok(inserted)
+        Ok(())
    }

    //
@@ -4905,7 +4818,7 @@ mod tests {
            .await?;

        let lsn = Lsn(0x10);
-        let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
+        bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;

        let guard = tline.layers.read().await;
        guard.layer_map().dump(true, &ctx).await?;
@@ -4966,39 +4879,9 @@ mod tests {
                    &ctx,
                )
                .await;
-
-            let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
-            let mut expect_missing = false;
-            let mut key = read.start().unwrap();
-            while key != read.end().unwrap() {
-                if let Some(lsns) = inserted.get(&key) {
-                    let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
-                    match expected_lsn {
-                        Some(lsn) => {
-                            expected_lsns.insert(key, *lsn);
-                        }
-                        None => {
-                            expect_missing = true;
-                            break;
-                        }
-                    }
-                } else {
-                    expect_missing = true;
-                    break;
-                }
-
-                key = key.next();
-            }
-
-            if expect_missing {
-                assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
-            } else {
-                for (key, image) in vectored_res? {
-                    let expected_lsn = expected_lsns.get(&key).expect("determined above");
-                    let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
-                    assert_eq!(image?, expected_image);
-                }
-            }
+            tline
+                .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
+                .await;
        }

        Ok(())
@@ -5048,6 +4931,10 @@ mod tests {
            )
            .await;

+        child_timeline
+            .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
+            .await;
+
        let images = vectored_res?;
        assert!(images.is_empty());
        Ok(())
@@ -6958,10 +6845,7 @@ mod tests {
        }

        let cancel = CancellationToken::new();
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        for (idx, expected) in expected_result.iter().enumerate() {
            assert_eq!(
@@ -7025,11 +6909,7 @@ mod tests {
            vec![
                // Image layer at GC horizon
                PersistentLayerKey {
-                    key_range: {
-                        let mut key = Key::MAX;
-                        key.field6 -= 1;
-                        Key::MIN..key
-                    },
+                    key_range: Key::MIN..Key::MAX,
                    lsn_range: Lsn(0x30)..Lsn(0x31),
                    is_delta: false
                },
@@ -7048,18 +6928,6 @@ mod tests {
            ]
        );

-        // increase GC horizon and compact again
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x40);
-            guard.cutoffs.space = Lsn(0x40);
-        }
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-
        Ok(())
    }

@@ -7392,10 +7260,7 @@ mod tests {
        }

        let cancel = CancellationToken::new();
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        for idx in 0..10 {
            assert_eq!(
@@ -7414,18 +7279,6 @@ mod tests {
            );
        }

-        // increase GC horizon and compact again
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x40);
-            guard.cutoffs.space = Lsn(0x40);
-        }
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-
        Ok(())
    }

@@ -7494,7 +7347,6 @@ mod tests {
                Lsn(0x60),
                &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
                3,
-                None,
            )
            .await
            .unwrap();
@@ -7619,7 +7471,7 @@ mod tests {
            ),
        ];
        let res = tline
-            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
+            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3)
            .await
            .unwrap();
        let expected_res = KeyHistoryRetention {
@@ -7665,114 +7517,6 @@ mod tests {
        };
        assert_eq!(res, expected_res);

-        // In case of branch compaction, the branch itself does not have the full history, and we need to provide
-        // the ancestor image in the test case.
-
-        let history = vec![
-            (
-                key,
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-            ),
-            (
-                key,
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
-            ),
-            (
-                key,
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
-            ),
-            (
-                key,
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            ),
-        ];
-        let res = tline
-            .generate_key_retention(
-                key,
-                &history,
-                Lsn(0x60),
-                &[],
-                3,
-                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
-            )
-            .await
-            .unwrap();
-        let expected_res = KeyHistoryRetention {
-            below_horizon: vec![(
-                Lsn(0x60),
-                KeyLogAtLsn(vec![(
-                    Lsn(0x60),
-                    Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
-                )]),
-            )],
-            above_horizon: KeyLogAtLsn(vec![(
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            )]),
-        };
-        assert_eq!(res, expected_res);
-
-        let history = vec![
-            (
-                key,
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-            ),
-            (
-                key,
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
-            ),
-            (
-                key,
-                Lsn(0x60),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
-            ),
-            (
-                key,
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            ),
-        ];
-        let res = tline
-            .generate_key_retention(
-                key,
-                &history,
-                Lsn(0x60),
-                &[Lsn(0x30)],
-                3,
-                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
-            )
-            .await
-            .unwrap();
-        let expected_res = KeyHistoryRetention {
-            below_horizon: vec![
-                (
-                    Lsn(0x30),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x20),
-                        Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-                    )]),
-                ),
-                (
-                    Lsn(0x60),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x60),
-                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
-                    )]),
-                ),
-            ],
-            above_horizon: KeyLogAtLsn(vec![(
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            )]),
-        };
-        assert_eq!(res, expected_res);
-
        Ok(())
    }

@@ -7930,10 +7674,6 @@ mod tests {
        ];

        let verify_result = || async {
-            let gc_horizon = {
-                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
-            };
            for idx in 0..10 {
                assert_eq!(
                    tline
@@ -7944,7 +7684,7 @@ mod tests {
                );
                assert_eq!(
                    tline
-                        .get(get_key(idx as u32), gc_horizon, &ctx)
+                        .get(get_key(idx as u32), Lsn(0x30), &ctx)
                        .await
                        .unwrap(),
                    &expected_result_at_gc_horizon[idx]
@@ -7969,232 +7709,7 @@ mod tests {
        verify_result().await;

        let cancel = CancellationToken::new();
-        let mut dryrun_flags = EnumSet::new();
-        dryrun_flags.insert(CompactFlags::DryRun);
-
-        tline
-            .compact_with_gc(&cancel, dryrun_flags, &ctx)
-            .await
-            .unwrap();
-        // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
-        // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
-        verify_result().await;
-
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        verify_result().await;
-
-        // compact again
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        verify_result().await;
-
-        // increase GC horizon and compact again
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x38);
-            guard.cutoffs.space = Lsn(0x38);
-        }
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
-
-        // not increasing the GC horizon and compact again
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        verify_result().await;
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
-        let (tenant, ctx) = harness.load().await;
-
-        fn get_key(id: u32) -> Key {
-            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-
-        let img_layer = (0..10)
-            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
-            .collect_vec();
-
-        let delta1 = vec![
-            (
-                get_key(1),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(2),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x28),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
-            ),
-        ];
-        let delta2 = vec![
-            (
-                get_key(5),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(6),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-        ];
-        let delta3 = vec![
-            (
-                get_key(8),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-            (
-                get_key(9),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-        ];
-
-        let parent_tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                vec![],                       // delta layers
-                vec![(Lsn(0x18), img_layer)], // image layers
-                Lsn(0x18),
-            )
-            .await?;
-
-        parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
-
-        let branch_tline = tenant
-            .branch_timeline_test_with_layers(
-                &parent_tline,
-                NEW_TIMELINE_ID,
-                Some(Lsn(0x18)),
-                &ctx,
-                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
-                ], // delta layers
-                vec![], // image layers
-                Lsn(0x50),
-            )
-            .await?;
-
-        branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
-
-        {
-            // Update GC info
-            let mut guard = parent_tline.gc_info.write().unwrap();
-            *guard = GcInfo {
-                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
-                cutoffs: GcCutoffs {
-                    time: Lsn(0x10),
-                    space: Lsn(0x10),
-                },
-                leases: Default::default(),
-                within_ancestor_pitr: false,
-            };
-        }
-
-        {
-            // Update GC info
-            let mut guard = branch_tline.gc_info.write().unwrap();
-            *guard = GcInfo {
-                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
-                cutoffs: GcCutoffs {
-                    time: Lsn(0x50),
-                    space: Lsn(0x50),
-                },
-                leases: Default::default(),
-                within_ancestor_pitr: false,
-            };
-        }
-
-        let expected_result_at_gc_horizon = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10@0x30"),
-            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10@0x48"),
-            Bytes::from_static(b"value 9@0x10@0x48"),
-        ];
-
-        let expected_result_at_lsn_40 = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10@0x30"),
-            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10"),
-            Bytes::from_static(b"value 9@0x10"),
-        ];
-
-        let verify_result = || async {
-            for idx in 0..10 {
-                assert_eq!(
-                    branch_tline
-                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_gc_horizon[idx]
-                );
-                assert_eq!(
-                    branch_tline
-                        .get(get_key(idx as u32), Lsn(0x40), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_lsn_40[idx]
-                );
-            }
-        };
-
-        verify_result().await;
-
-        let cancel = CancellationToken::new();
-        branch_tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        verify_result().await;

--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -28,12 +28,6 @@ use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-#[derive(Copy, Clone, Debug)]
-pub struct CompressionInfo {
-    pub written_compressed: bool,
-    pub compressed_size: Option<usize>,
-}
-
 impl<'a> BlockCursor<'a> {
    /// Read a blob into a new buffer.
    pub async fn read_blob(
@@ -279,10 +273,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: B,
        ctx: &RequestContext,
    ) -> (B::Buf, Result<u64, Error>) {
-        let (buf, res) = self
-            .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
-            .await;
-        (buf, res.map(|(off, _compression_info)| off))
+        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
+            .await
    }

    /// Write a blob of data. Returns the offset that it was written to,
@@ -292,12 +284,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: B,
        ctx: &RequestContext,
        algorithm: ImageCompressionAlgorithm,
-    ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
+    ) -> (B::Buf, Result<u64, Error>) {
        let offset = self.offset;
-        let mut compression_info = CompressionInfo {
-            written_compressed: false,
-            compressed_size: None,
-        };

        let len = srcbuf.bytes_init();

@@ -340,9 +328,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                        encoder.write_all(&slice[..]).await.unwrap();
                        encoder.shutdown().await.unwrap();
                        let compressed = encoder.into_inner();
-                        compression_info.compressed_size = Some(compressed.len());
                        if compressed.len() < len {
-                            compression_info.written_compressed = true;
                            let compressed_len = compressed.len();
                            compressed_buf = Some(compressed);
                            (BYTE_ZSTD, compressed_len, slice.into_inner())
@@ -373,7 +359,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        } else {
            self.write_all(srcbuf, ctx).await
        };
-        (srcbuf, res.map(|_| (offset, compression_info)))
+        (srcbuf, res.map(|_| offset))
    }
 }

@@ -430,14 +416,12 @@ pub(crate) mod tests {
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
                let (_, res) = if compression {
-                    let res = wtr
-                        .write_blob_maybe_compressed(
-                            blob.clone(),
-                            ctx,
-                            ImageCompressionAlgorithm::Zstd { level: Some(1) },
-                        )
-                        .await;
-                    (res.0, res.1.map(|(off, _)| off))
+                    wtr.write_blob_maybe_compressed(
+                        blob.clone(),
+                        ctx,
+                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
+                    )
+                    .await
                } else {
                    wtr.write_blob(blob.clone(), ctx).await
                };
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,213 +0,0 @@
-use std::collections::HashMap;
-
-use utils::id::TimelineId;
-
-use super::remote_timeline_client::index::GcBlockingReason;
-
-type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
-
-#[derive(Default)]
-pub(crate) struct GcBlock {
-    /// The timelines which have current reasons to block gc.
-    ///
-    /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
-    /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
-    reasons: std::sync::Mutex<Storage>,
-    blocking: tokio::sync::Mutex<()>,
-}
-
-impl GcBlock {
-    /// Start another gc iteration.
-    ///
-    /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
-    /// it's ending, or if not currently possible, a value describing the reasons why not.
-    ///
-    /// Cancellation safe.
-    pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
-        let reasons = {
-            let g = self.reasons.lock().unwrap();
-
-            // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
-            // tests, we use everything. we should warn if the gc has been consecutively blocked
-            // for more than 1h (within single tenant session?).
-            BlockingReasons::clean_and_summarize(g)
-        };
-
-        if let Some(reasons) = reasons {
-            Err(reasons)
-        } else {
-            Ok(Guard {
-                _inner: self.blocking.lock().await,
-            })
-        }
-    }
-
-    pub(crate) fn summary(&self) -> Option<BlockingReasons> {
-        let g = self.reasons.lock().unwrap();
-
-        BlockingReasons::summarize(&g)
-    }
-
-    /// Start blocking gc for this one timeline for the given reason.
-    ///
-    /// This is not a guard based API but instead it mimics set API. The returned future will not
-    /// resolve until an existing gc round has completed.
-    ///
-    /// Returns true if this block was new, false if gc was already blocked for this reason.
-    ///
-    /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will
-    /// keep the gc blocking reason.
-    pub(crate) async fn insert(
-        &self,
-        timeline: &super::Timeline,
-        reason: GcBlockingReason,
-    ) -> anyhow::Result<bool> {
-        let (added, uploaded) = {
-            let mut g = self.reasons.lock().unwrap();
-            let set = g.entry(timeline.timeline_id).or_default();
-            let added = set.insert(reason);
-
-            // LOCK ORDER: intentionally hold the lock, see self.reasons.
-            let uploaded = timeline
-                .remote_client
-                .schedule_insert_gc_block_reason(reason)?;
-
-            (added, uploaded)
-        };
-
-        uploaded.await?;
-
-        // ensure that any ongoing gc iteration has completed
-        drop(self.blocking.lock().await);
-
-        Ok(added)
-    }
-
-    /// Remove blocking gc for this one timeline and the given reason.
-    pub(crate) async fn remove(
-        &self,
-        timeline: &super::Timeline,
-        reason: GcBlockingReason,
-    ) -> anyhow::Result<()> {
-        use std::collections::hash_map::Entry;
-
-        super::span::debug_assert_current_span_has_tenant_and_timeline_id();
-
-        let (remaining_blocks, uploaded) = {
-            let mut g = self.reasons.lock().unwrap();
-            match g.entry(timeline.timeline_id) {
-                Entry::Occupied(mut oe) => {
-                    let set = oe.get_mut();
-                    set.remove(reason);
-                    if set.is_empty() {
-                        oe.remove();
-                    }
-                }
-                Entry::Vacant(_) => {
-                    // we must still do the index_part.json update regardless, in case we had earlier
-                    // been cancelled
-                }
-            }
-
-            let remaining_blocks = g.len();
-
-            // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
-            let uploaded = timeline
-                .remote_client
-                .schedule_remove_gc_block_reason(reason)?;
-
-            (remaining_blocks, uploaded)
-        };
-        uploaded.await?;
-
-        // no need to synchronize with gc iteration again
-
-        if remaining_blocks > 0 {
-            tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
-        } else {
-            tracing::info!("gc is now unblocked for the tenant");
-        }
-
-        Ok(())
-    }
-
-    pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
-        let unblocked = {
-            let mut g = self.reasons.lock().unwrap();
-            if g.is_empty() {
-                return;
-            }
-
-            g.remove(&timeline.timeline_id);
-
-            BlockingReasons::clean_and_summarize(g).is_none()
-        };
-
-        if unblocked {
-            tracing::info!("gc is now unblocked following deletion");
-        }
-    }
-
-    /// Initialize with the non-deleted timelines of this tenant.
-    pub(crate) fn set_scanned(&self, scanned: Storage) {
-        let mut g = self.reasons.lock().unwrap();
-        assert!(g.is_empty());
-        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
-
-        if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
-            tracing::info!(summary=?reasons, "initialized with gc blocked");
-        }
-    }
-}
-
-pub(super) struct Guard<'a> {
-    _inner: tokio::sync::MutexGuard<'a, ()>,
-}
-
-#[derive(Debug)]
-pub(crate) struct BlockingReasons {
-    timelines: usize,
-    reasons: enumset::EnumSet<GcBlockingReason>,
-}
-
-impl std::fmt::Display for BlockingReasons {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{} timelines block for {:?}",
-            self.timelines, self.reasons
-        )
-    }
-}
-
-impl BlockingReasons {
-    fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        let mut reasons = enumset::EnumSet::empty();
-        g.retain(|_key, value| {
-            reasons = reasons.union(*value);
-            !value.is_empty()
-        });
-        if !g.is_empty() {
-            Some(BlockingReasons {
-                timelines: g.len(),
-                reasons,
-            })
-        } else {
-            None
-        }
-    }
-
-    fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        if g.is_empty() {
-            None
-        } else {
-            let reasons = g
-                .values()
-                .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
-            Some(BlockingReasons {
-                timelines: g.len(),
-                reasons,
-            })
-        }
-    }
-}
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,8 +51,7 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
-use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
+use pageserver_api::keyspace::KeySpaceAccum;
 use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
@@ -62,7 +61,7 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;

-use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
+use super::storage_layer::PersistentLayerDesc;

 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -872,183 +871,11 @@ impl LayerMap {
        println!("End dump LayerMap");
        Ok(())
    }
-
-    /// `read_points` represent the tip of a timeline and any branch points, i.e. the places
-    /// where we expect to serve reads.
-    ///
-    /// This function is O(N) and should be called infrequently.  The caller is responsible for
-    /// looking up and updating the Layer objects for these layer descriptors.
-    pub fn get_visibility(
-        &self,
-        mut read_points: Vec<Lsn>,
-    ) -> (
-        Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
-        KeySpace,
-    ) {
-        // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
-        // KeySpace is intended to be composed statically and iterated over.
-        struct KeyShadow {
-            // Map of range start to range end
-            inner: RangeSetBlaze<i128>,
-        }
-
-        impl KeyShadow {
-            fn new() -> Self {
-                Self {
-                    inner: Default::default(),
-                }
-            }
-
-            fn contains(&self, range: Range<Key>) -> bool {
-                let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
-                self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
-                    CheckSortedDisjoint::from([range_incl]),
-                ))
-            }
-
-            /// Add the input range to the keys covered by self.
-            ///
-            /// Return true if inserting this range covered some keys that were previously not covered
-            fn cover(&mut self, insert: Range<Key>) -> bool {
-                let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
-                self.inner.ranges_insert(range_incl)
-            }
-
-            fn reset(&mut self) {
-                self.inner = Default::default();
-            }
-
-            fn to_keyspace(&self) -> KeySpace {
-                let mut accum = KeySpaceAccum::new();
-                for range_incl in self.inner.ranges() {
-                    let range = Range {
-                        start: Key::from_i128(*range_incl.start()),
-                        end: Key::from_i128(range_incl.end() + 1),
-                    };
-                    accum.add_range(range)
-                }
-
-                accum.to_keyspace()
-            }
-        }
-
-        // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
-        // and a ReadPoint
-        read_points.sort_by_key(|rp| rp.0);
-        let mut shadow = KeyShadow::new();
-
-        // We will interleave all our read points and layers into a sorted collection
-        enum Item {
-            ReadPoint { lsn: Lsn },
-            Layer(Arc<PersistentLayerDesc>),
-        }
-
-        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
-        items.extend(self.iter_historic_layers().map(Item::Layer));
-        items.extend(
-            read_points
-                .into_iter()
-                .map(|rp| Item::ReadPoint { lsn: rp }),
-        );
-
-        // Ordering: we want to iterate like this:
-        // 1. Highest LSNs first
-        // 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
-        // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
-        items.sort_by_key(|item| {
-            std::cmp::Reverse(match item {
-                Item::Layer(layer) => {
-                    if layer.is_delta() {
-                        (Lsn(layer.get_lsn_range().end.0 - 1), 0)
-                    } else {
-                        (layer.image_layer_lsn(), 1)
-                    }
-                }
-                Item::ReadPoint { lsn } => (*lsn, 2),
-            })
-        });
-
-        let mut results = Vec::with_capacity(self.historic.len());
-
-        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
-
-        for item in items {
-            let (reached_lsn, is_readpoint) = match &item {
-                Item::ReadPoint { lsn } => (lsn, true),
-                Item::Layer(layer) => (&layer.lsn_range.start, false),
-            };
-            maybe_covered_deltas.retain(|d| {
-                if *reached_lsn >= d.lsn_range.start && is_readpoint {
-                    // We encountered a readpoint within the delta layer: it is visible
-
-                    results.push((d.clone(), LayerVisibilityHint::Visible));
-                    false
-                } else if *reached_lsn < d.lsn_range.start {
-                    // We passed the layer's range without encountering a read point: it is not visible
-                    results.push((d.clone(), LayerVisibilityHint::Covered));
-                    false
-                } else {
-                    // We're still in the delta layer: continue iterating
-                    true
-                }
-            });
-
-            match item {
-                Item::ReadPoint { lsn: _lsn } => {
-                    // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
-                    // to assume that the whole key range is visible at the branch point.
-                    shadow.reset();
-                }
-                Item::Layer(layer) => {
-                    let visibility = if layer.is_delta() {
-                        if shadow.contains(layer.get_key_range()) {
-                            // If a layer isn't visible based on current state, we must defer deciding whether
-                            // it is truly not visible until we have advanced past the delta's range: we might
-                            // encounter another branch point within this delta layer's LSN range.
-                            maybe_covered_deltas.push(layer);
-                            continue;
-                        } else {
-                            LayerVisibilityHint::Visible
-                        }
-                    } else {
-                        let modified = shadow.cover(layer.get_key_range());
-                        if modified {
-                            // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
-                            LayerVisibilityHint::Visible
-                        } else {
-                            // An image layer in a region that was already covered
-                            LayerVisibilityHint::Covered
-                        }
-                    };
-
-                    results.push((layer, visibility));
-                }
-            }
-        }
-
-        // Drain any remaining maybe_covered deltas
-        results.extend(
-            maybe_covered_deltas
-                .into_iter()
-                .map(|d| (d, LayerVisibilityHint::Covered)),
-        );
-
-        (results, shadow.to_keyspace())
-    }
 }

 #[cfg(test)]
 mod tests {
-    use crate::tenant::{storage_layer::LayerName, IndexPart};
-    use pageserver_api::{
-        key::DBDIR_KEY,
-        keyspace::{KeySpace, KeySpaceRandomAccum},
-    };
-    use std::{collections::HashMap, path::PathBuf};
-    use utils::{
-        id::{TenantId, TimelineId},
-        shard::TenantShardId,
-    };
+    use pageserver_api::keyspace::KeySpace;

    use super::*;

@@ -1175,299 +1002,4 @@ mod tests {
            }
        }
    }
-
-    #[test]
-    fn layer_visibility_basic() {
-        // A simple synthetic input, as a smoke test.
-        let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
-        let timeline_id = TimelineId::generate();
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-
-        const FAKE_LAYER_SIZE: u64 = 1024;
-
-        let inject_delta = |updates: &mut BatchedUpdates,
-                            key_start: i128,
-                            key_end: i128,
-                            lsn_start: u64,
-                            lsn_end: u64| {
-            let desc = PersistentLayerDesc::new_delta(
-                tenant_shard_id,
-                timeline_id,
-                Range {
-                    start: Key::from_i128(key_start),
-                    end: Key::from_i128(key_end),
-                },
-                Range {
-                    start: Lsn(lsn_start),
-                    end: Lsn(lsn_end),
-                },
-                1024,
-            );
-            updates.insert_historic(desc.clone());
-            desc
-        };
-
-        let inject_image =
-            |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
-                let desc = PersistentLayerDesc::new_img(
-                    tenant_shard_id,
-                    timeline_id,
-                    Range {
-                        start: Key::from_i128(key_start),
-                        end: Key::from_i128(key_end),
-                    },
-                    Lsn(lsn),
-                    FAKE_LAYER_SIZE,
-                );
-                updates.insert_historic(desc.clone());
-                desc
-            };
-
-        //
-        // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
-        // we expect to handle.  You can follow these examples through in the same order as they would be processed
-        // by the function under test.
-        //
-
-        let mut read_points = vec![Lsn(1000)];
-
-        // A delta ahead of any image layer
-        let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
-
-        // An image layer is visible and covers some layers beneath itself
-        let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
-
-        // A delta layer covered by the image layer: should be covered
-        let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
-
-        // A delta layer partially covered by an image layer: should be visible
-        let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
-
-        // A delta layer not covered by an image layer: should be visible
-        let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
-
-        // An image layer covered by the image layer above: should be covered
-        let covered_image = inject_image(&mut updates, 10, 20, 89);
-
-        // An image layer partially covered by an image layer: should be visible
-        let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
-
-        // An image layer not covered by an image layer: should be visible
-        let not_covered_image = inject_image(&mut updates, 1, 4, 89);
-
-        // A read point: this will make subsequent layers below here visible, even if there are
-        // more recent layers covering them.
-        read_points.push(Lsn(80));
-
-        // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
-        let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
-
-        // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
-        // the read point should make it visible, even though its end LSN is covered
-        let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
-        let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
-        read_points.push(Lsn(65));
-        let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
-
-        let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
-
-        updates.flush();
-
-        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
-        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
-
-        assert_eq!(
-            layer_visibilities.get(&ahead_layer),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&visible_covering_img),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&partially_covered_delta),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&not_covered_delta),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_image),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&partially_covered_image),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&not_covered_image),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_below_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covering_img_between_read_points),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_between_read_points),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_intersects_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&visible_img_after_last_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-
-        // Shadow should include all the images below the last read point
-        let expected_shadow = KeySpace {
-            ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
-        };
-        assert_eq!(shadow, expected_shadow);
-    }
-
-    fn fixture_path(relative: &str) -> PathBuf {
-        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
-    }
-
-    #[test]
-    fn layer_visibility_realistic() {
-        // Load a large example layermap
-        let index_raw = std::fs::read_to_string(fixture_path(
-            "test_data/indices/mixed_workload/index_part.json",
-        ))
-        .unwrap();
-        let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
-
-        let tenant_id = TenantId::generate();
-        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-        let timeline_id = TimelineId::generate();
-
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-        for (layer_name, layer_metadata) in index.layer_metadata {
-            let layer_desc = match layer_name {
-                LayerName::Image(layer_name) => PersistentLayerDesc {
-                    key_range: layer_name.key_range.clone(),
-                    lsn_range: layer_name.lsn_as_range(),
-                    tenant_shard_id,
-                    timeline_id,
-                    is_delta: false,
-                    file_size: layer_metadata.file_size,
-                },
-                LayerName::Delta(layer_name) => PersistentLayerDesc {
-                    key_range: layer_name.key_range,
-                    lsn_range: layer_name.lsn_range,
-                    tenant_shard_id,
-                    timeline_id,
-                    is_delta: true,
-                    file_size: layer_metadata.file_size,
-                },
-            };
-            updates.insert_historic(layer_desc);
-        }
-        updates.flush();
-
-        let read_points = vec![index.metadata.disk_consistent_lsn()];
-        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
-        for (layer_desc, visibility) in &layer_visibilities {
-            tracing::info!("{layer_desc:?}: {visibility:?}");
-            eprintln!("{layer_desc:?}: {visibility:?}");
-        }
-
-        // The shadow should be non-empty, since there were some image layers
-        assert!(!shadow.ranges.is_empty());
-
-        // At least some layers should be marked covered
-        assert!(layer_visibilities
-            .iter()
-            .any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
-
-        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
-
-        // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
-        for (layer_desc, visible) in &layer_visibilities {
-            let mut coverage = KeySpaceRandomAccum::new();
-            let mut covered_by = Vec::new();
-
-            for other_layer in layer_map.iter_historic_layers() {
-                if &other_layer == layer_desc {
-                    continue;
-                }
-                if !other_layer.is_delta()
-                    && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
-                    && other_layer.key_range.start <= layer_desc.key_range.end
-                    && layer_desc.key_range.start <= other_layer.key_range.end
-                {
-                    coverage.add_range(other_layer.get_key_range());
-                    covered_by.push((*other_layer).clone());
-                }
-            }
-            let coverage = coverage.to_keyspace();
-
-            let expect_visible = if coverage.ranges.len() == 1
-                && coverage.contains(&layer_desc.key_range.start)
-                && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
-            {
-                LayerVisibilityHint::Covered
-            } else {
-                LayerVisibilityHint::Visible
-            };
-
-            if expect_visible != *visible {
-                eprintln!(
-                    "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
-                    layer_desc.key_range.start,
-                    layer_desc.key_range.end,
-                    layer_desc.lsn_range.start,
-                    layer_desc.lsn_range.end,
-                    layer_desc.is_delta()
-                );
-                if expect_visible == LayerVisibilityHint::Covered {
-                    eprintln!("Covered by:");
-                    for other in covered_by {
-                        eprintln!(
-                            "  {}..{} @ {}",
-                            other.get_key_range().start,
-                            other.get_key_range().end,
-                            other.image_layer_lsn()
-                        );
-                    }
-                    if let Some(range) = coverage.ranges.first() {
-                        eprintln!(
-                            "Total coverage from contributing layers: {}..{}",
-                            range.start, range.end
-                        );
-                    } else {
-                        eprintln!(
-                            "Total coverage from contributing layers: {:?}",
-                            coverage.ranges
-                        );
-                    }
-                }
-            }
-            assert_eq!(expect_visible, *visible);
-        }
-
-        // Sanity: the layer that holds latest data for the DBDIR key should always be visible
-        // (just using this key as a key that will always exist for any layermap fixture)
-        let dbdir_layer = layer_map
-            .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
-            .unwrap();
-        assert!(matches!(
-            layer_visibilities.get(&dbdir_layer.layer).unwrap(),
-            LayerVisibilityHint::Visible
-        ));
-    }
 }
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -521,10 +521,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {

        Ok(&self.historic_coverage)
    }
-
-    pub(crate) fn len(&self) -> usize {
-        self.layers.len()
-    }
 }

 #[test]
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -55,7 +55,7 @@ use utils::id::{TenantId, TimelineId};
 use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
 use super::timeline::detach_ancestor::PreparedTimelineDetach;
-use super::{GlobalShutDown, TenantSharedResources};
+use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is elegible to service
@@ -116,6 +116,8 @@ pub(crate) enum ShardSelector {
    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
    /// ignore it.
    Zero,
+    /// Pick the first shard we find for the TenantId
+    First,
    /// Pick the shard that holds this key
    Page(Key),
    /// The shard ID is known: pick the given shard
@@ -665,20 +667,17 @@ pub async fn init_tenant_mgr(
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
-            LocationMode::Attached(attached_conf) => TenantSlot::Attached(
-                tenant_spawn(
-                    conf,
-                    tenant_shard_id,
-                    &tenant_dir_path,
-                    resources.clone(),
-                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-                    shard_identity,
-                    Some(init_order.clone()),
-                    SpawnMode::Lazy,
-                    &ctx,
-                )
-                .expect("global shutdown during init_tenant_mgr cannot happen"),
-            ),
+            LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
+                conf,
+                tenant_shard_id,
+                &tenant_dir_path,
+                resources.clone(),
+                AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                shard_identity,
+                Some(init_order.clone()),
+                SpawnMode::Lazy,
+                &ctx,
+            )),
            LocationMode::Secondary(secondary_conf) => {
                info!(
                    tenant_id = %tenant_shard_id.tenant_id,
@@ -726,7 +725,7 @@ fn tenant_spawn(
    init_order: Option<InitializationOrder>,
    mode: SpawnMode,
    ctx: &RequestContext,
-) -> Result<Arc<Tenant>, GlobalShutDown> {
+) -> Arc<Tenant> {
    // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
    // path, and contains a configuration file.  Assertions that do synchronous I/O are limited to debug mode
    // to avoid impacting prod runtime performance.
@@ -1193,10 +1192,7 @@ impl TenantManager {
                    None,
                    spawn_mode,
                    ctx,
-                )
-                .map_err(|_: GlobalShutDown| {
-                    UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
-                })?;
+                );

                TenantSlot::Attached(tenant)
            }
@@ -1317,7 +1313,7 @@ impl TenantManager {
            None,
            SpawnMode::Eager,
            ctx,
-        )?;
+        );

        slot_guard.upsert(TenantSlot::Attached(tenant))?;

@@ -1388,32 +1384,34 @@ impl TenantManager {
        tenant_shard_id: TenantShardId,
    ) -> Result<(), DeleteTenantError> {
        let remote_path = remote_tenant_path(&tenant_shard_id);
-        let mut keys_stream = self.resources.remote_storage.list_streaming(
-            Some(&remote_path),
-            remote_storage::ListingMode::NoDelimiter,
-            None,
-            &self.cancel,
-        );
-        while let Some(chunk) = keys_stream.next().await {
-            let keys = match chunk {
-                Ok(listing) => listing.keys,
-                Err(remote_storage::DownloadError::Cancelled) => {
-                    return Err(DeleteTenantError::Cancelled)
-                }
-                Err(remote_storage::DownloadError::NotFound) => return Ok(()),
-                Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
-            };
-
-            if keys.is_empty() {
-                tracing::info!("Remote storage already deleted");
-            } else {
-                tracing::info!("Deleting {} keys from remote storage", keys.len());
-                let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
-                self.resources
-                    .remote_storage
-                    .delete_objects(&keys, &self.cancel)
-                    .await?;
+        let keys = match self
+            .resources
+            .remote_storage
+            .list(
+                Some(&remote_path),
+                remote_storage::ListingMode::NoDelimiter,
+                None,
+                &self.cancel,
+            )
+            .await
+        {
+            Ok(listing) => listing.keys,
+            Err(remote_storage::DownloadError::Cancelled) => {
+                return Err(DeleteTenantError::Cancelled)
            }
+            Err(remote_storage::DownloadError::NotFound) => return Ok(()),
+            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
+        };
+
+        if keys.is_empty() {
+            tracing::info!("Remote storage already deleted");
+        } else {
+            tracing::info!("Deleting {} keys from remote storage", keys.len());
+            let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
+            self.resources
+                .remote_storage
+                .delete_objects(&keys, &self.cancel)
+                .await?;
        }

        Ok(())
@@ -2051,7 +2049,7 @@ impl TenantManager {
            None,
            SpawnMode::Eager,
            ctx,
-        )?;
+        );

        slot_guard.upsert(TenantSlot::Attached(tenant))?;

@@ -2092,6 +2090,7 @@ impl TenantManager {
                    };

                    match selector {
+                        ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                            return ShardResolveResult::Found(tenant.clone())
                        }
@@ -2173,9 +2172,6 @@ pub(crate) enum GetActiveTenantError {
    /// never happen.
    #[error("Tenant is broken: {0}")]
    Broken(String),
-
-    #[error("reconnect to switch tenant id")]
-    SwitchedTenant,
 }

 #[derive(Debug, thiserror::Error)]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -800,123 +800,6 @@ impl RemoteTimelineClient {
            .context("wait completion")
    }

-    /// Adds a gc blocking reason for this timeline if one does not exist already.
-    ///
-    /// A retryable step of timeline detach ancestor.
-    ///
-    /// Returns a future which waits until the completion of the upload.
-    pub(crate) fn schedule_insert_gc_block_reason(
-        self: &Arc<Self>,
-        reason: index::GcBlockingReason,
-    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
-    {
-        let maybe_barrier = {
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard.initialized_mut()?;
-
-            if let index::GcBlockingReason::DetachAncestor = reason {
-                if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
-                    drop(guard);
-                    panic!("cannot start detach ancestor if there is nothing to detach from");
-                }
-            }
-
-            let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
-
-            let current = upload_queue.dirty.gc_blocking.as_ref();
-            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
-
-            match (current, uploaded) {
-                (x, y) if wanted(x) && wanted(y) => None,
-                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
-                // Usual case: !wanted(x) && !wanted(y)
-                //
-                // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
-                // turn on and off some reason.
-                (x, y) => {
-                    if !wanted(x) && wanted(y) {
-                        // this could be avoided by having external in-memory synchronization, like
-                        // timeline detach ancestor
-                        warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
-                    }
-
-                    // at this point, the metadata must always show that there is a parent
-                    upload_queue.dirty.gc_blocking = current
-                        .map(|x| x.with_reason(reason))
-                        .or_else(|| Some(index::GcBlocking::started_now_for(reason)));
-                    self.schedule_index_upload(upload_queue)?;
-                    Some(self.schedule_barrier0(upload_queue))
-                }
-            }
-        };
-
-        Ok(async move {
-            if let Some(barrier) = maybe_barrier {
-                Self::wait_completion0(barrier).await?;
-            }
-            Ok(())
-        })
-    }
-
-    /// Removes a gc blocking reason for this timeline if one exists.
-    ///
-    /// A retryable step of timeline detach ancestor.
-    ///
-    /// Returns a future which waits until the completion of the upload.
-    pub(crate) fn schedule_remove_gc_block_reason(
-        self: &Arc<Self>,
-        reason: index::GcBlockingReason,
-    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
-    {
-        let maybe_barrier = {
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard.initialized_mut()?;
-
-            if let index::GcBlockingReason::DetachAncestor = reason {
-                if !upload_queue
-                    .clean
-                    .0
-                    .lineage
-                    .is_detached_from_original_ancestor()
-                {
-                    drop(guard);
-                    panic!("cannot complete timeline_ancestor_detach while not detached");
-                }
-            }
-
-            let wanted = |x: Option<&index::GcBlocking>| {
-                x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
-            };
-
-            let current = upload_queue.dirty.gc_blocking.as_ref();
-            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
-
-            match (current, uploaded) {
-                (x, y) if wanted(x) && wanted(y) => None,
-                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
-                (x, y) => {
-                    if !wanted(x) && wanted(y) {
-                        warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
-                    }
-
-                    upload_queue.dirty.gc_blocking =
-                        current.as_ref().and_then(|x| x.without_reason(reason));
-                    assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
-                    // FIXME: bogus ?
-                    self.schedule_index_upload(upload_queue)?;
-                    Some(self.schedule_barrier0(upload_queue))
-                }
-            }
-        };
-
-        Ok(async move {
-            if let Some(barrier) = maybe_barrier {
-                Self::wait_completion0(barrier).await?;
-            }
-            Ok(())
-        })
-    }
-
    /// Launch an upload operation in the background; the file is added to be included in next
    /// `index_part.json` upload.
    pub(crate) fn schedule_layer_file_upload(
@@ -1495,18 +1378,6 @@ impl RemoteTimelineClient {
                .dirty
                .layer_metadata
                .drain()
-                .filter(|(_file_name, meta)| {
-                    // Filter out layers that belonged to an ancestor shard.  Since we are deleting the whole timeline from
-                    // all shards anyway, we _could_ delete these, but
-                    // - it creates a potential race if other shards are still
-                    //   using the layers while this shard deletes them.
-                    // - it means that if we rolled back the shard split, the ancestor shards would be in a state where
-                    //   these timelines are present but corrupt (their index exists but some layers don't)
-                    //
-                    // These layers will eventually be cleaned up by the scrubber when it does physical GC.
-                    meta.shard.shard_number == self.tenant_shard_id.shard_number
-                        && meta.shard.shard_count == self.tenant_shard_id.shard_count
-                })
                .map(|(file_name, meta)| {
                    remote_layer_path(
                        &self.tenant_shard_id.tenant_id,
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -60,9 +60,6 @@ pub struct IndexPart {
    #[serde(default)]
    pub(crate) lineage: Lineage,

-    #[serde(skip_serializing_if = "Option::is_none", default)]
-    pub(crate) gc_blocking: Option<GcBlocking>,
-
    /// Describes the kind of aux files stored in the timeline.
    ///
    /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
@@ -88,11 +85,10 @@ impl IndexPart {
    /// - 6: last_aux_file_policy is added.
    /// - 7: metadata_bytes is no longer written, but still read
    /// - 8: added `archived_at`
-    /// - 9: +gc_blocking
-    const LATEST_VERSION: usize = 9;
+    const LATEST_VERSION: usize = 8;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];

    pub const FILE_NAME: &'static str = "index_part.json";

@@ -105,7 +101,6 @@ impl IndexPart {
            deleted_at: None,
            archived_at: None,
            lineage: Default::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        }
    }
@@ -256,64 +251,6 @@ impl Lineage {
    }
 }

-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub(crate) struct GcBlocking {
-    pub(crate) started_at: NaiveDateTime,
-    pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
-}
-
-#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
-#[enumset(serialize_repr = "list")]
-pub(crate) enum GcBlockingReason {
-    Manual,
-    DetachAncestor,
-}
-
-impl GcBlocking {
-    pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
-        GcBlocking {
-            started_at: chrono::Utc::now().naive_utc(),
-            reasons: enumset::EnumSet::only(reason),
-        }
-    }
-
-    /// Returns true if the given reason is one of the reasons why the gc is blocked.
-    pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
-        self.reasons.contains(reason)
-    }
-
-    /// Returns a version of self with the given reason.
-    pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
-        assert!(!self.blocked_by(reason));
-        let mut reasons = self.reasons;
-        reasons.insert(reason);
-
-        Self {
-            started_at: self.started_at,
-            reasons,
-        }
-    }
-
-    /// Returns a version of self without the given reason. Assumption is that if
-    /// there are no more reasons, we can unblock the gc by returning `None`.
-    pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
-        assert!(self.blocked_by(reason));
-
-        if self.reasons.len() == 1 {
-            None
-        } else {
-            let mut reasons = self.reasons;
-            assert!(reasons.remove(reason));
-            assert!(!reasons.is_empty());
-
-            Some(Self {
-                started_at: self.started_at,
-                reasons,
-            })
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -355,7 +292,6 @@ mod tests {
            deleted_at: None,
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -399,7 +335,6 @@ mod tests {
            deleted_at: None,
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -444,7 +379,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -492,7 +426,6 @@ mod tests {
            deleted_at: None,
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -535,7 +468,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -581,7 +513,6 @@ mod tests {
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
            },
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -632,7 +563,6 @@ mod tests {
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
            },
-            gc_blocking: None,
            last_aux_file_policy: Some(AuxFilePolicy::V2),
        };

@@ -688,7 +618,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
            lineage: Default::default(),
-            gc_blocking: None,
            last_aux_file_policy: Default::default(),
        };

@@ -745,7 +674,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
            lineage: Default::default(),
-            gc_blocking: None,
            last_aux_file_policy: Default::default(),
        };

@@ -753,68 +681,6 @@ mod tests {
        assert_eq!(part, expected);
    }

-    #[test]
-    fn v9_indexpart_is_parsed() {
-        let example = r#"{
-            "version": 9,
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata": {
-                "disk_consistent_lsn": "0/16960E8",
-                "prev_record_lsn": "0/1696070",
-                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
-                "ancestor_lsn": "0/0",
-                "latest_gc_cutoff_lsn": "0/1696070",
-                "initdb_lsn": "0/1696070",
-                "pg_version": 14
-            },
-            "gc_blocking": {
-                "started_at": "2024-07-19T09:00:00.123",
-                "reasons": ["DetachAncestor"]
-            }
-        }"#;
-
-        let expected = IndexPart {
-            version: 9,
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
-                    file_size: 25600000,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
-                    file_size: 9007199254741001,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::new(
-                Lsn::from_str("0/16960E8").unwrap(),
-                Some(Lsn::from_str("0/1696070").unwrap()),
-                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
-                Lsn::INVALID,
-                Lsn::from_str("0/1696070").unwrap(),
-                Lsn::from_str("0/1696070").unwrap(),
-                14,
-            ).with_recalculated_checksum().unwrap(),
-            deleted_at: None,
-            lineage: Default::default(),
-            gc_blocking: Some(GcBlocking {
-                started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
-                reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
-            }),
-            last_aux_file_policy: Default::default(),
-            archived_at: None,
-        };
-
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
-        assert_eq!(part, expected);
-    }
-
    fn parse_naive_datetime(s: &str) -> NaiveDateTime {
        chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
    }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,9 +8,6 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;

-#[cfg(test)]
-pub mod split_writer;
-
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
 use crate::walrecord::NeonWalRecord;
@@ -435,18 +432,39 @@ impl ReadableLayer {
    }
 }

+/// Return value from [`Layer::get_value_reconstruct_data`]
+#[derive(Clone, Copy, Debug)]
+pub enum ValueReconstructResult {
+    /// Got all the data needed to reconstruct the requested page
+    Complete,
+    /// This layer didn't contain all the required data; the caller should look up
+    /// the predecessor layer and collect more data from there (no LSN is returned).
+    Continue,
+
+    /// This layer didn't contain the data needed to reconstruct the requested page
+    /// version (no LSN is returned). This is usually considered an error, but might
+    /// be OK in some circumstances.
+    Missing,
+}
+
 /// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
 /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
 /// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
 /// be used for cache management but not for correctness-critical checks.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum LayerVisibilityHint {
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
+pub(crate) enum LayerVisibilityHint {
    /// A Visible layer might be read while serving a read, because there is not an image layer between it
    /// and a readable LSN (the tip of the branch or a child's branch point)
    Visible,
    /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
    /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
+    #[allow(unused)]
    Covered,
+    /// Calculating layer visibility requires I/O, so until this has happened layers are loaded
+    /// in this state.  Note that newly written layers may be called Visible immediately, this uninitialized
+    /// state is for when existing layers are constructed while loading a timeline.
+    #[default]
+    Uninitialized,
 }

 pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
@@ -539,25 +557,19 @@ impl LayerAccessStats {
        self.record_residence_event_at(SystemTime::now())
    }

-    fn record_access_at(&self, now: SystemTime) -> bool {
+    pub(crate) fn record_access_at(&self, now: SystemTime) {
        let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);

        // A layer which is accessed must be visible.
        mask |= 0x1 << Self::VISIBILITY_SHIFT;
        value |= 0x1 << Self::VISIBILITY_SHIFT;

-        let old_bits = self.write_bits(mask, value);
-        !matches!(
-            self.decode_visibility(old_bits),
-            LayerVisibilityHint::Visible
-        )
+        self.write_bits(mask, value);
    }

-    /// Returns true if we modified the layer's visibility to set it to Visible implicitly
-    /// as a result of this access
-    pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
+    pub(crate) fn record_access(&self, ctx: &RequestContext) {
        if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
-            return false;
+            return;
        }

        self.record_access_at(SystemTime::now())
@@ -614,29 +626,22 @@ impl LayerAccessStats {
        }
    }

-    /// Helper for extracting the visibility hint from the literal value of our inner u64
-    fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
-        match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
-            1 => LayerVisibilityHint::Visible,
-            0 => LayerVisibilityHint::Covered,
-            _ => unreachable!(),
-        }
-    }
-
-    /// Returns the old value which has been replaced
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
        let value = match visibility {
            LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
-            LayerVisibilityHint::Covered => 0x0,
+            LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
        };

-        let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
-        self.decode_visibility(old_bits)
+        self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
    }

    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
-        self.decode_visibility(read)
+        match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
+            1 => LayerVisibilityHint::Visible,
+            0 => LayerVisibilityHint::Covered,
+            _ => unreachable!(),
+        }
    }
 }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -36,12 +36,13 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
 use crate::tenant::disk_btree::{
    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadPlanner,
 };
-use crate::tenant::PageReconstructError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
@@ -71,7 +72,10 @@ use utils::{
    lsn::Lsn,
 };

-use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
+use super::{
+    AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
+    ValuesReconstructState,
+};

 ///
 /// Header stored in the beginning of the file
@@ -196,6 +200,7 @@ impl DeltaKey {
 pub struct DeltaLayer {
    path: Utf8PathBuf,
    pub desc: PersistentLayerDesc,
+    access_stats: LayerAccessStats,
    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

@@ -294,6 +299,7 @@ impl DeltaLayer {
    /// not loaded already.
    ///
    async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
+        self.access_stats.record_access(ctx);
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
@@ -344,6 +350,7 @@ impl DeltaLayer {
                summary.lsn_range,
                metadata.len(),
            ),
+            access_stats: Default::default(),
            inner: OnceCell::new(),
        })
    }
@@ -366,6 +373,7 @@ impl DeltaLayer {
 /// 3. Call `finish`.
 ///
 struct DeltaLayerWriterInner {
+    conf: &'static PageServerConf,
    pub path: Utf8PathBuf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
@@ -376,9 +384,6 @@ struct DeltaLayerWriterInner {
    tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,

    blob_writer: BlobWriter<true>,
-
-    // Number of key-lsns in the layer.
-    num_keys: usize,
 }

 impl DeltaLayerWriterInner {
@@ -412,6 +417,7 @@ impl DeltaLayerWriterInner {
        let tree_builder = DiskBtreeBuilder::new(block_buf);

        Ok(Self {
+            conf,
            path,
            timeline_id,
            tenant_shard_id,
@@ -419,7 +425,6 @@ impl DeltaLayerWriterInner {
            lsn_range,
            tree: tree_builder,
            blob_writer,
-            num_keys: 0,
        })
    }

@@ -462,7 +467,7 @@ impl DeltaLayerWriterInner {
            .write_blob_maybe_compressed(val, ctx, compression)
            .await;
        let off = match res {
-            Ok((off, _)) => off,
+            Ok(off) => off,
            Err(e) => return (val, Err(anyhow::anyhow!(e))),
        };

@@ -470,9 +475,6 @@ impl DeltaLayerWriterInner {

        let delta_key = DeltaKey::from_key_lsn(&key, lsn);
        let res = self.tree.append(&delta_key.0, blob_ref.0);
-
-        self.num_keys += 1;
-
        (val, res.map_err(|e| anyhow::anyhow!(e)))
    }

@@ -486,10 +488,11 @@ impl DeltaLayerWriterInner {
    async fn finish(
        self,
        key_end: Key,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+    ) -> anyhow::Result<ResidentLayer> {
        let temp_path = self.path.clone();
-        let result = self.finish0(key_end, ctx).await;
+        let result = self.finish0(key_end, timeline, ctx).await;
        if result.is_err() {
            tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
            if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -502,8 +505,9 @@ impl DeltaLayerWriterInner {
    async fn finish0(
        self,
        key_end: Key,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -568,9 +572,11 @@ impl DeltaLayerWriterInner {
        // fsync the file
        file.sync_all().await?;

-        trace!("created delta layer {}", self.path);
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        Ok((desc, self.path))
+        trace!("created delta layer {}", layer.local_path());
+
+        Ok(layer)
    }
 }

@@ -671,20 +677,14 @@ impl DeltaLayerWriter {
    pub(crate) async fn finish(
        mut self,
        key_end: Key,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        self.inner.take().unwrap().finish(key_end, ctx).await
-    }
-
-    #[cfg(test)]
-    pub(crate) fn num_keys(&self) -> usize {
-        self.inner.as_ref().unwrap().num_keys
-    }
-
-    #[cfg(test)]
-    pub(crate) fn estimated_size(&self) -> u64 {
-        let inner = self.inner.as_ref().unwrap();
-        inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
+    ) -> anyhow::Result<ResidentLayer> {
+        self.inner
+            .take()
+            .unwrap()
+            .finish(key_end, timeline, ctx)
+            .await
    }
 }

@@ -808,6 +808,95 @@ impl DeltaLayerInner {
        })
    }

+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let mut need_image = true;
+        // Scan the page versions backwards, starting from the last LSN in `lsn_range` (end is exclusive).
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            &block_reader,
+        );
+        let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
+
+        let mut offsets: Vec<(Lsn, u64)> = Vec::new();
+
+        tree_reader
+            .visit(
+                &search_key.0,
+                VisitDirection::Backwards,
+                |key, value| {
+                    let blob_ref = BlobRef(value);
+                    if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
+                        return false;
+                    }
+                    let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                    if entry_lsn < lsn_range.start {
+                        return false;
+                    }
+                    offsets.push((entry_lsn, blob_ref.pos()));
+
+                    !blob_ref.will_init() // presumably `false` ends the visit: nothing older than a will_init entry is needed
+                },
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+                    .build(),
+            )
+            .await?;
+
+        let ctx = &RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::DeltaLayerValue)
+            .build();
+
+        // Ok, 'offsets' now contains the offsets of all the entries we need to read
+        let cursor = block_reader.block_cursor();
+        let mut buf = Vec::new();
+        for (entry_lsn, pos) in offsets {
+            cursor
+                .read_blob_into_buf(pos, &mut buf, ctx)
+                .await
+                .with_context(|| {
+                    format!("Failed to read blob from virtual file {}", self.file.path)
+                })?;
+            let val = Value::des(&buf).with_context(|| {
+                format!(
+                    "Failed to deserialize file blob from virtual file {}",
+                    self.file.path
+                )
+            })?;
+            match val {
+                Value::Image(img) => {
+                    reconstruct_state.img = Some((entry_lsn, img));
+                    need_image = false;
+                    break;
+                }
+                Value::WalRecord(rec) => {
+                    let will_init = rec.will_init();
+                    reconstruct_state.records.push((entry_lsn, rec));
+                    if will_init {
+                        // This WAL record initializes the page, so no need to go further back
+                        need_image = false;
+                        break;
+                    }
+                }
+            }
+        }
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
+    }
+
    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    //
@@ -1580,9 +1669,8 @@ pub(crate) mod test {
    use super::*;
    use crate::repository::Value;
    use crate::tenant::harness::TIMELINE_ID;
-    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
-    use crate::tenant::{Tenant, Timeline};
+    use crate::tenant::Tenant;
    use crate::{
        context::DownloadBehavior,
        task_mgr::TaskKind,
@@ -1876,8 +1964,9 @@ pub(crate) mod test {
            res?;
        }

-        let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?;
-        let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?;
+        let resident = writer
+            .finish(entries_meta.key_range.end, &timeline, &ctx)
+            .await?;

        let inner = resident.get_as_delta(&ctx).await?;

@@ -2066,8 +2155,7 @@ pub(crate) mod test {
                .await
                .unwrap();

-            let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap();
-            let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();
+            let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();

            copied_layer.get_as_delta(ctx).await.unwrap();

@@ -2195,9 +2283,7 @@ pub(crate) mod test {
        for (key, lsn, value) in deltas {
            writer.put_value(key, lsn, value, ctx).await?;
        }
-
-        let (desc, path) = writer.finish(key_end, ctx).await?;
-        let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
+        let delta_layer = writer.finish(key_end, tline, ctx).await?;

        Ok::<_, anyhow::Error>(delta_layer)
    }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -32,6 +32,9 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{
    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
+use crate::tenant::storage_layer::{
+    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
+};
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -134,6 +137,7 @@ pub struct ImageLayer {
    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
+    access_stats: LayerAccessStats,
    inner: OnceCell<ImageLayerInner>,
 }

@@ -251,6 +255,7 @@ impl ImageLayer {
    /// not loaded already.
    ///
    async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
+        self.access_stats.record_access(ctx);
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
            .await
@@ -301,6 +306,7 @@ impl ImageLayer {
                metadata.len(),
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
+            access_stats: Default::default(),
            inner: OnceCell::new(),
        })
    }
@@ -423,6 +429,46 @@ impl ImageLayerInner {
        })
    }

+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
+
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf);
+        if let Some(offset) = tree_reader
+            .get(
+                &keybuf,
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::ImageLayerBtreeNode)
+                    .build(),
+            )
+            .await?
+        {
+            let blob = block_reader
+                .block_cursor()
+                .read_blob(
+                    offset,
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::ImageLayerValue)
+                        .build(),
+                )
+                .await
+                .with_context(|| format!("failed to read value from offset {}", offset))?;
+            let value = Bytes::from(blob);
+
+            reconstruct_state.img = Some((self.lsn, value)); // image is tagged with this layer's own LSN
+            Ok(ValueReconstructResult::Complete)
+        } else {
+            Ok(ValueReconstructResult::Missing) // the index lookup returned no offset for `key`
+        }
+    }
+
    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    pub(super) async fn get_values_reconstruct_data(
@@ -688,29 +734,11 @@ struct ImageLayerWriterInner {
    // Total uncompressed bytes passed into put_image
    uncompressed_bytes: u64,

-    // Like `uncompressed_bytes`,
-    // but only of images we might consider for compression
-    uncompressed_bytes_eligible: u64,
-
-    // Like `uncompressed_bytes`, but only of images
-    // where we have chosen their compressed form
-    uncompressed_bytes_chosen: u64,
-
-    // Number of keys in the layer.
-    num_keys: usize,
-
    blob_writer: BlobWriter<false>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
-
-    #[cfg_attr(not(feature = "testing"), allow(dead_code))]
-    last_written_key: Key,
 }

 impl ImageLayerWriterInner {
-    fn size(&self) -> u64 {
-        self.tree.borrow_writer().size() + self.blob_writer.size()
-    }
-
    ///
    /// Start building a new image layer.
    ///
@@ -762,10 +790,6 @@ impl ImageLayerWriterInner {
            tree: tree_builder,
            blob_writer,
            uncompressed_bytes: 0,
-            uncompressed_bytes_eligible: 0,
-            uncompressed_bytes_chosen: 0,
-            num_keys: 0,
-            last_written_key: Key::MIN,
        };

        Ok(writer)
@@ -784,33 +808,18 @@ impl ImageLayerWriterInner {
    ) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
        let compression = self.conf.image_compression;
-        let uncompressed_len = img.len() as u64;
-        self.uncompressed_bytes += uncompressed_len;
-        self.num_keys += 1;
+        self.uncompressed_bytes += img.len() as u64;
        let (_img, res) = self
            .blob_writer
            .write_blob_maybe_compressed(img, ctx, compression)
            .await;
        // TODO: re-use the buffer for `img` further upstack
-        let (off, compression_info) = res?;
-        if compression_info.compressed_size.is_some() {
-            // The image has been considered for compression at least
-            self.uncompressed_bytes_eligible += uncompressed_len;
-        }
-        if compression_info.written_compressed {
-            // The image has been compressed
-            self.uncompressed_bytes_chosen += uncompressed_len;
-        }
+        let off = res?;

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
        self.tree.append(&keybuf, off)?;

-        #[cfg(feature = "testing")]
-        {
-            self.last_written_key = key;
-        }
-
        Ok(())
    }

@@ -821,7 +830,6 @@ impl ImageLayerWriterInner {
        self,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-        end_key: Option<Key>,
    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -829,9 +837,6 @@ impl ImageLayerWriterInner {
        // Calculate compression ratio
        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
-            .inc_by(self.uncompressed_bytes_eligible);
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
        crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);

        let mut file = self.blob_writer.into_inner();
@@ -872,23 +877,11 @@ impl ImageLayerWriterInner {
        let desc = PersistentLayerDesc::new_img(
            self.tenant_shard_id,
            self.timeline_id,
-            if let Some(end_key) = end_key {
-                self.key_range.start..end_key
-            } else {
-                self.key_range.clone()
-            },
+            self.key_range.clone(),
            self.lsn,
            metadata.len(),
        );

-        #[cfg(feature = "testing")]
-        if let Some(end_key) = end_key {
-            assert!(
-                self.last_written_key < end_key,
-                "written key violates end_key range"
-            );
-        }
-
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -965,18 +958,6 @@ impl ImageLayerWriter {
        self.inner.as_mut().unwrap().put_image(key, img, ctx).await
    }

-    #[cfg(test)]
-    /// Estimated size of the image layer.
-    pub(crate) fn estimated_size(&self) -> u64 {
-        let inner = self.inner.as_ref().unwrap();
-        inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
-    }
-
-    #[cfg(test)]
-    pub(crate) fn num_keys(&self) -> usize {
-        self.inner.as_ref().unwrap().num_keys
-    }
-
    ///
    /// Finish writing the image layer.
    ///
@@ -985,26 +966,7 @@ impl ImageLayerWriter {
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner.take().unwrap().finish(timeline, ctx, None).await
-    }
-
-    #[cfg(test)]
-    /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
-    pub(super) async fn finish_with_end_key(
-        mut self,
-        timeline: &Arc<Timeline>,
-        end_key: Key,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(timeline, ctx, Some(end_key))
-            .await
-    }
-
-    pub(crate) fn size(&self) -> u64 {
-        self.inner.as_ref().unwrap().size()
+        self.inner.take().unwrap().finish(timeline, ctx).await
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,11 +10,11 @@ use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value};
 use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
+use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
-use crate::tenant::PageReconstructError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::{l0_flush, page_cache, walrecord};
-use anyhow::{anyhow, Result};
-use camino::Utf8PathBuf;
+use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
@@ -34,7 +34,8 @@ use std::sync::atomic::{AtomicU64, AtomicUsize};
 use tokio::sync::{RwLock, RwLockWriteGuard};

 use super::{
-    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
+    DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
+    ValuesReconstructState,
 };

 #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
@@ -54,6 +55,9 @@ pub struct InMemoryLayer {
    /// Writes are only allowed when this is `None`.
    pub(crate) end_lsn: OnceLock<Lsn>,

+    /// Used for traversal path. Cached string representation of the in-memory layer before it is frozen.
+    local_path_str: Arc<str>,
+
    /// Used for traversal path. Cached representation of the in-memory layer after frozen.
    frozen_local_path_str: OnceLock<Arc<str>>,

@@ -244,6 +248,12 @@ impl InMemoryLayer {
        self.start_lsn..self.end_lsn_or_max()
    }

+    pub(crate) fn local_path_str(&self) -> &Arc<str> {
+        self.frozen_local_path_str
+            .get()
+            .unwrap_or(&self.local_path_str)
+    }
+
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
@@ -293,6 +303,60 @@ impl InMemoryLayer {
        Ok(())
    }

+    /// Look up given value in the layer.
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        ensure!(lsn_range.start >= self.start_lsn);
+        let mut need_image = true;
+
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+
+        let inner = self.inner.read().await;
+
+        let reader = inner.file.block_cursor();
+
+        // Scan this key's page versions that fall within `lsn_range`, in reverse order.
+        if let Some(vec_map) = inner.index.get(&key) {
+            let slice = vec_map.slice_range(lsn_range);
+            for (entry_lsn, pos) in slice.iter().rev() {
+                let buf = reader.read_blob(*pos, &ctx).await?;
+                let value = Value::des(&buf)?;
+                match value {
+                    Value::Image(img) => {
+                        reconstruct_state.img = Some((*entry_lsn, img));
+                        return Ok(ValueReconstructResult::Complete);
+                    }
+                    Value::WalRecord(rec) => {
+                        let will_init = rec.will_init();
+                        reconstruct_state.records.push((*entry_lsn, rec));
+                        if will_init {
+                            // This WAL record initializes the page, so no need to go further back
+                            need_image = false;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        // The read guard `inner` is not dropped explicitly; it is released when this function returns.
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
+    }
+
    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    //
@@ -394,6 +458,11 @@ impl InMemoryLayer {

        Ok(InMemoryLayer {
            file_id: key,
+            local_path_str: {
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
+                buf.into()
+            },
            frozen_local_path_str: OnceLock::new(),
            conf,
            timeline_id,
@@ -413,7 +482,8 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub async fn put_value(
+
+    pub(crate) async fn put_value(
        &self,
        key: Key,
        lsn: Lsn,
@@ -478,6 +548,8 @@ impl InMemoryLayer {
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    pub async fn freeze(&self, end_lsn: Lsn) {
+        let inner = self.inner.write().await;
+
        assert!(
            self.start_lsn < end_lsn,
            "{} >= {}",
@@ -495,13 +567,9 @@ impl InMemoryLayer {
            })
            .expect("frozen_local_path_str set only once");

-        #[cfg(debug_assertions)]
-        {
-            let inner = self.inner.write().await;
-            for vec_map in inner.index.values() {
-                for (lsn, _pos) in vec_map.as_slice() {
-                    assert!(*lsn < end_lsn);
-                }
+        for vec_map in inner.index.values() {
+            for (lsn, _pos) in vec_map.as_slice() {
+                assert!(*lsn < end_lsn);
            }
        }
    }
@@ -511,12 +579,12 @@ impl InMemoryLayer {
    /// if there are no matching keys.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub async fn write_to_disk(
+    pub(crate) async fn write_to_disk(
        &self,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        key_range: Option<Range<Key>>,
-        l0_flush_global_state: &l0_flush::Inner,
-    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
+    ) -> Result<Option<ResidentLayer>> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -528,8 +596,9 @@ impl InMemoryLayer {
        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().await;

+        let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
        use l0_flush::Inner;
-        let _concurrency_permit = match l0_flush_global_state {
+        let _concurrency_permit = match &*l0_flush_global_state {
            Inner::PageCached => None,
            Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
        };
@@ -559,7 +628,7 @@ impl InMemoryLayer {
        )
        .await?;

-        match l0_flush_global_state {
+        match &*l0_flush_global_state {
            l0_flush::Inner::PageCached => {
                let ctx = RequestContextBuilder::extend(ctx)
                    .page_content_kind(PageContentKind::InMemoryLayer)
@@ -624,7 +693,7 @@ impl InMemoryLayer {
        }

        // MAX is used here because we identify L0 layers by full key range
-        let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;

        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
        //
@@ -636,6 +705,6 @@ impl InMemoryLayer {
        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
        drop(_concurrency_permit);

-        Ok(Some((desc, path)))
+        Ok(Some(delta_layer))
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -24,7 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
 use super::image_layer::{self};
 use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
+    PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -246,7 +246,7 @@ impl Layer {
                &timeline.generation,
            );

-            LayerInner::new(
+            let layer = LayerInner::new(
                conf,
                timeline,
                local_path,
@@ -254,7 +254,14 @@ impl Layer {
                Some(inner),
                timeline.generation,
                timeline.get_shard_index(),
-            )
+            );
+
+            // Newly created layers are marked visible by default: the usual case is that they were created to be read.
+            layer
+                .access_stats
+                .set_visibility(super::LayerVisibilityHint::Visible);
+
+            layer
        }));

        let downloaded = resident.expect("just initialized");
@@ -300,6 +307,42 @@ impl Layer {
        self.0.delete_on_drop();
    }

+    /// Return data needed to reconstruct given page at LSN.
+    ///
+    /// It is up to the caller to collect more data from the previous layer and
+    /// perform WAL redo, if necessary.
+    ///
+    /// # Cancellation-Safety
+    ///
+    /// This method is cancellation-safe.
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        use anyhow::ensure;
+
+        let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
+        self.0.access_stats.record_access(ctx);
+
+        if self.layer_desc().is_delta {
+            ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
+            ensure!(self.layer_desc().key_range.contains(&key));
+        } else {
+            ensure!(self.layer_desc().key_range.contains(&key));
+            ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
+            ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
+        }
+
+        layer
+            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
+            .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
+            .await
+            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
+    }
+
    pub(crate) async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
@@ -316,7 +359,7 @@ impl Layer {
                other => GetVectoredError::Other(anyhow::anyhow!(other)),
            })?;

-        self.record_access(ctx);
+        self.0.access_stats.record_access(ctx);

        layer
            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -396,18 +439,18 @@ impl Layer {
        self.0.info(reset)
    }

-    pub(crate) fn latest_activity(&self) -> SystemTime {
-        self.0.access_stats.latest_activity()
-    }
-
-    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
-        self.0.access_stats.visibility()
+    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
+        &self.0.access_stats
    }

    pub(crate) fn local_path(&self) -> &Utf8Path {
        &self.0.path
    }

+    pub(crate) fn debug_str(&self) -> &Arc<str> {
+        &self.0.debug_str
+    }
+
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.0.metadata()
    }
@@ -450,57 +493,13 @@ impl Layer {
            }
        }
    }
-
-    fn record_access(&self, ctx: &RequestContext) {
-        if self.0.access_stats.record_access(ctx) {
-            // Visibility was modified to Visible
-            tracing::info!(
-                "Layer {} became visible as a result of access",
-                self.0.desc.key()
-            );
-            if let Some(tl) = self.0.timeline.upgrade() {
-                tl.metrics
-                    .visible_physical_size_gauge
-                    .add(self.0.desc.file_size)
-            }
-        }
-    }
-
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
-        let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
-        use LayerVisibilityHint::*;
-        match (old_visibility, visibility) {
-            (Visible, Covered) => {
-                // Subtract this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    debug_assert!(
-                        tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
-                    );
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .sub(self.0.desc.file_size)
-                }
-            }
-            (Covered, Visible) => {
-                // Add this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .add(self.0.desc.file_size)
-                }
-            }
-            (Covered, Covered) | (Visible, Visible) => {
-                // no change
-            }
-        }
-    }
 }

 /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
 ///
 /// However when we want something evicted, we cannot evict it right away as there might be current
 /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
-/// read with [`Layer::get_values_reconstruct_data`].
+/// read with [`Layer::get_value_reconstruct_data`].
 ///
 /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
 #[derive(Debug)]
@@ -581,6 +580,9 @@ struct LayerInner {
    /// Full path to the file; unclear if this should exist anymore.
    path: Utf8PathBuf,

+    /// String representation of the layer, used for traversal id.
+    debug_str: Arc<str>,
+
    desc: PersistentLayerDesc,

    /// Timeline access is needed for remote timeline client and metrics.
@@ -691,16 +693,6 @@ impl Drop for LayerInner {
                timeline.metrics.layer_count_image.dec();
                timeline.metrics.layer_size_image.sub(self.desc.file_size);
            }
-
-            if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
-                debug_assert!(
-                    timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
-                );
-                timeline
-                    .metrics
-                    .visible_physical_size_gauge
-                    .sub(self.desc.file_size);
-            }
        }

        if !*self.wanted_deleted.get_mut() {
@@ -809,14 +801,11 @@ impl LayerInner {
            timeline.metrics.layer_size_image.add(desc.file_size);
        }

-        // New layers are visible by default. This metric is later updated on drop or in set_visibility
-        timeline
-            .metrics
-            .visible_physical_size_gauge
-            .add(desc.file_size);
-
        LayerInner {
            conf,
+            debug_str: {
+                format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
+            },
            path: local_path,
            desc,
            timeline: Arc::downgrade(timeline),
@@ -1737,6 +1726,28 @@ impl DownloadedLayer {
            .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
    }

+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        owner: &Arc<LayerInner>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        use LayerKind::*;
+
+        match self.get(owner, ctx).await? {
+            Delta(d) => {
+                d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
+                    .await
+            }
+            Image(i) => {
+                i.get_value_reconstruct_data(key, reconstruct_data, ctx)
+                    .await
+            }
+        }
+    }
+
    async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
@@ -1835,7 +1846,7 @@ impl ResidentLayer {
                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
                // while it's being held.
-                self.owner.record_access(ctx);
+                owner.access_stats.record_access(ctx);

                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -50,26 +50,13 @@ async fn smoke_test() {
    // all layers created at pageserver are like `layer`, initialized with strong
    // Arc<DownloadedLayer>.

-    let controlfile_keyspace = KeySpace {
-        ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
-    };
-
    let img_before = {
-        let mut data = ValuesReconstructState::default();
+        let mut data = ValueReconstructState::default();
        layer
-            .get_values_reconstruct_data(
-                controlfile_keyspace.clone(),
-                Lsn(0x10)..Lsn(0x11),
-                &mut data,
-                &ctx,
-            )
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
            .await
            .unwrap();
-        data.keys
-            .remove(&CONTROLFILE_KEY)
-            .expect("must be present")
-            .expect("should not error")
-            .img
+        data.img
            .take()
            .expect("tenant harness writes the control file")
    };
@@ -87,24 +74,13 @@ async fn smoke_test() {

    // on accesses when the layer is evicted, it will automatically be downloaded.
    let img_after = {
-        let mut data = ValuesReconstructState::default();
+        let mut data = ValueReconstructState::default();
        layer
-            .get_values_reconstruct_data(
-                controlfile_keyspace.clone(),
-                Lsn(0x10)..Lsn(0x11),
-                &mut data,
-                &ctx,
-            )
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
            .instrument(download_span.clone())
            .await
            .unwrap();
-        data.keys
-            .remove(&CONTROLFILE_KEY)
-            .expect("must be present")
-            .expect("should not error")
-            .img
-            .take()
-            .expect("tenant harness writes the control file")
+        data.img.take().unwrap()
    };

    assert_eq!(img_before, img_after);
@@ -854,7 +830,7 @@ async fn eviction_cancellation_on_drop() {
 fn layer_size() {
    assert_eq!(size_of::<LayerAccessStats>(), 8);
    assert_eq!(size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(size_of::<LayerInner>(), 296);
+    assert_eq!(size_of::<LayerInner>(), 312);
    // it also has the utf8 path
 }

--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -41,20 +41,6 @@ pub struct PersistentLayerKey {
    pub is_delta: bool,
 }

-impl std::fmt::Display for PersistentLayerKey {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{}..{} {}..{} is_delta={}",
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta
-        )
-    }
-}
-
 impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -1,454 +0,0 @@
-use std::{ops::Range, sync::Arc};
-
-use bytes::Bytes;
-use pageserver_api::key::{Key, KEY_SIZE};
-use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
-
-use crate::tenant::storage_layer::Layer;
-use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
-
-use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
-
-/// An image writer that takes images and produces multiple image layers. The interface does not
-/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
-/// to be cleaned up)
-#[must_use]
-pub struct SplitImageLayerWriter {
-    inner: ImageLayerWriter,
-    target_layer_size: u64,
-    generated_layers: Vec<ResidentLayer>,
-    conf: &'static PageServerConf,
-    timeline_id: TimelineId,
-    tenant_shard_id: TenantShardId,
-    lsn: Lsn,
-}
-
-impl SplitImageLayerWriter {
-    pub async fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_shard_id: TenantShardId,
-        start_key: Key,
-        lsn: Lsn,
-        target_layer_size: u64,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
-            target_layer_size,
-            inner: ImageLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                &(start_key..Key::MAX),
-                lsn,
-                ctx,
-            )
-            .await?,
-            generated_layers: Vec::new(),
-            conf,
-            timeline_id,
-            tenant_shard_id,
-            lsn,
-        })
-    }
-
-    pub async fn put_image(
-        &mut self,
-        key: Key,
-        img: Bytes,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // The current estimation is an upper bound of the space that the key/image could take
-        // because we did not consider compression in this estimation. The resulting image layer
-        // could be smaller than the target size.
-        let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
-        {
-            let next_image_writer = ImageLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_shard_id,
-                &(key..Key::MAX),
-                self.lsn,
-                ctx,
-            )
-            .await?;
-            let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
-            self.generated_layers.push(
-                prev_image_writer
-                    .finish_with_end_key(tline, key, ctx)
-                    .await?,
-            );
-        }
-        self.inner.put_image(key, img, ctx).await
-    }
-
-    pub(crate) async fn finish(
-        self,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-        end_key: Key,
-    ) -> anyhow::Result<Vec<ResidentLayer>> {
-        let Self {
-            mut generated_layers,
-            inner,
-            ..
-        } = self;
-        generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
-        Ok(generated_layers)
-    }
-
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
-    #[allow(dead_code)]
-    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
-        Ok((self.generated_layers, self.inner))
-    }
-}
-
-/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
-/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
-/// to be cleaned up).
-#[must_use]
-pub struct SplitDeltaLayerWriter {
-    inner: DeltaLayerWriter,
-    target_layer_size: u64,
-    generated_layers: Vec<ResidentLayer>,
-    conf: &'static PageServerConf,
-    timeline_id: TimelineId,
-    tenant_shard_id: TenantShardId,
-    lsn_range: Range<Lsn>,
-}
-
-impl SplitDeltaLayerWriter {
-    pub async fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_shard_id: TenantShardId,
-        start_key: Key,
-        lsn_range: Range<Lsn>,
-        target_layer_size: u64,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
-            target_layer_size,
-            inner: DeltaLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                start_key,
-                lsn_range.clone(),
-                ctx,
-            )
-            .await?,
-            generated_layers: Vec::new(),
-            conf,
-            timeline_id,
-            tenant_shard_id,
-            lsn_range,
-        })
-    }
-
-    pub async fn put_value(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        val: Value,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
-        // number, and therefore the final layer size could be a little bit larger or smaller than the target.
-        let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
-        {
-            let next_delta_writer = DeltaLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_shard_id,
-                key,
-                self.lsn_range.clone(),
-                ctx,
-            )
-            .await?;
-            let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
-            let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
-            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-            self.generated_layers.push(delta_layer);
-        }
-        self.inner.put_value(key, lsn, val, ctx).await
-    }
-
-    pub(crate) async fn finish(
-        self,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-        end_key: Key,
-    ) -> anyhow::Result<Vec<ResidentLayer>> {
-        let Self {
-            mut generated_layers,
-            inner,
-            ..
-        } = self;
-
-        let (desc, path) = inner.finish(end_key, ctx).await?;
-        let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-        generated_layers.push(delta_layer);
-        Ok(generated_layers)
-    }
-
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
-    #[allow(dead_code)]
-    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
-        Ok((self.generated_layers, self.inner))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::{
-        tenant::{
-            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::AsLayerDesc,
-        },
-        DEFAULT_PG_VERSION,
-    };
-
-    use super::*;
-
-    fn get_key(id: u32) -> Key {
-        let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-        key.field6 = id;
-        key
-    }
-
-    fn get_img(id: u32) -> Bytes {
-        format!("{id:064}").into()
-    }
-
-    fn get_large_img() -> Bytes {
-        vec![0; 8192].into()
-    }
-
-    #[tokio::test]
-    async fn write_one_image() {
-        let harness = TenantHarness::create("split_writer_write_one_image")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut image_writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        let mut delta_writer = SplitDeltaLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18)..Lsn(0x20),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        image_writer
-            .put_image(get_key(0), get_img(0), &tline, &ctx)
-            .await
-            .unwrap();
-        let layers = image_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 1);
-
-        delta_writer
-            .put_value(
-                get_key(0),
-                Lsn(0x18),
-                Value::Image(get_img(0)),
-                &tline,
-                &ctx,
-            )
-            .await
-            .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 1);
-    }
-
-    #[tokio::test]
-    async fn write_split() {
-        let harness = TenantHarness::create("split_writer_write_split")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut image_writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-        let mut delta_writer = SplitDeltaLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18)..Lsn(0x20),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-        const N: usize = 2000;
-        for i in 0..N {
-            let i = i as u32;
-            image_writer
-                .put_image(get_key(i), get_large_img(), &tline, &ctx)
-                .await
-                .unwrap();
-            delta_writer
-                .put_value(
-                    get_key(i),
-                    Lsn(0x20),
-                    Value::Image(get_large_img()),
-                    &tline,
-                    &ctx,
-                )
-                .await
-                .unwrap();
-        }
-        let image_layers = image_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
-        let delta_layers = delta_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
-        assert_eq!(image_layers.len(), N / 512 + 1);
-        assert_eq!(delta_layers.len(), N / 512 + 1);
-        for idx in 0..image_layers.len() {
-            assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
-            assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
-            assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
-            assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
-            if idx > 0 {
-                assert_eq!(
-                    image_layers[idx - 1].layer_desc().key_range.end,
-                    image_layers[idx].layer_desc().key_range.start
-                );
-                assert_eq!(
-                    delta_layers[idx - 1].layer_desc().key_range.end,
-                    delta_layers[idx].layer_desc().key_range.start
-                );
-            }
-        }
-    }
-
-    #[tokio::test]
-    async fn write_large_img() {
-        let harness = TenantHarness::create("split_writer_write_large_img")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut image_writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        let mut delta_writer = SplitDeltaLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18)..Lsn(0x20),
-            4 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        image_writer
-            .put_image(get_key(0), get_img(0), &tline, &ctx)
-            .await
-            .unwrap();
-        image_writer
-            .put_image(get_key(1), get_large_img(), &tline, &ctx)
-            .await
-            .unwrap();
-        let layers = image_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 2);
-
-        delta_writer
-            .put_value(
-                get_key(0),
-                Lsn(0x18),
-                Value::Image(get_img(0)),
-                &tline,
-                &ctx,
-            )
-            .await
-            .unwrap();
-        delta_writer
-            .put_value(
-                get_key(1),
-                Lsn(0x1A),
-                Value::Image(get_large_img()),
-                &tline,
-                &ctx,
-            )
-            .await
-            .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 2);
-    }
-}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -407,16 +407,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                        error_run_count += 1;
                        let wait_duration = Duration::from_secs_f64(wait_duration);

-                        if matches!(e, crate::tenant::GcError::TimelineCancelled) {
-                            // Timeline was cancelled during gc. We might either be in an event
-                            // that affects the entire tenant (tenant deletion, pageserver shutdown),
-                            // or in one that affects the timeline only (timeline deletion).
-                            // Therefore, don't exit the loop.
-                            info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
-                        } else {
-                            error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
-                        }
-
+                        error!(
+                        "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
+                    );
                        wait_duration
                    }
                }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3,7 +3,6 @@ pub(crate) mod compaction;
 pub mod delete;
 pub(crate) mod detach_ancestor;
 mod eviction_task;
-pub(crate) mod handle;
 mod init;
 pub mod layer_manager;
 pub(crate) mod logical_size;
@@ -18,12 +17,11 @@ use camino::Utf8Path;
 use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
-use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::{
-        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
-        NON_INHERITED_SPARSE_RANGE,
+        AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
+        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
    },
    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
@@ -59,7 +57,10 @@ use std::{
    collections::{BTreeMap, HashMap, HashSet},
    sync::atomic::AtomicU64,
 };
-use std::{cmp::min, cmp::Ordering, ops::ControlFlow};
+use std::{
+    cmp::{max, min, Ordering},
+    ops::ControlFlow,
+};
 use std::{
    collections::btree_map::Entry,
    ops::{Deref, Range},
@@ -73,7 +74,6 @@ use crate::{
        metadata::TimelineMetadata,
        storage_layer::PersistentLayerDesc,
    },
-    walredo,
 };
 use crate::{
    context::{DownloadBehavior, RequestContext},
@@ -84,8 +84,8 @@ use crate::{
    disk_usage_eviction_task::finite_f32,
    tenant::storage_layer::{
        AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
-        LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState,
-        ValuesReconstructState,
+        LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult,
+        ValueReconstructState, ValuesReconstructState,
    },
 };
 use crate::{
@@ -137,13 +137,10 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
+use super::{config::TenantConf, upload_queue::NotInitialized};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
-use super::{
-    remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
-    storage_layer::ReadableLayer,
-};
+use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
 use super::{
    secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
    GcError,
@@ -446,8 +443,6 @@ pub struct Timeline {
    pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,

    pub(crate) l0_flush_global_state: L0FlushGlobalState,
-
-    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
 }

 pub struct WalReceiverInfo {
@@ -553,21 +548,17 @@ impl GetVectoredError {
    }
 }

+#[derive(Debug)]
 pub struct MissingKeyError {
    key: Key,
    shard: ShardNumber,
    cont_lsn: Lsn,
    request_lsn: Lsn,
    ancestor_lsn: Option<Lsn>,
+    traversal_path: Vec<TraversalPathItem>,
    backtrace: Option<std::backtrace::Backtrace>,
 }

-impl std::fmt::Debug for MissingKeyError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self)
-    }
-}
-
 impl std::fmt::Display for MissingKeyError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
@@ -579,6 +570,18 @@ impl std::fmt::Display for MissingKeyError {
            write!(f, ", ancestor {}", ancestor_lsn)?;
        }

+        if !self.traversal_path.is_empty() {
+            writeln!(f)?;
+        }
+
+        for (r, c, l) in &self.traversal_path {
+            writeln!(
+                f,
+                "layer traversal: result {:?}, cont_lsn {}, layer: {}",
+                r, c, l,
+            )?;
+        }
+
        if let Some(ref backtrace) = self.backtrace {
            write!(f, "\n{}", backtrace)?;
        }
@@ -707,7 +710,6 @@ pub(crate) enum CompactFlags {
    ForceRepartition,
    ForceImageLayerCreation,
    EnhancedGcBottomMostCompaction,
-    DryRun,
 }

 impl std::fmt::Debug for Timeline {
@@ -921,44 +923,116 @@ impl Timeline {

        self.timeline_get_throttle.throttle(ctx, 1).await;

-        let keyspace = KeySpace {
-            ranges: vec![key..key.next()],
-        };
+        match self.conf.get_impl {
+            GetImpl::Legacy => {
+                let reconstruct_state = ValueReconstructState {
+                    records: Vec::new(),
+                    img: None,
+                };

-        // Initialise the reconstruct state for the key with the cache
-        // entry returned above.
-        let mut reconstruct_state = ValuesReconstructState::new();
+                self.get_impl(key, lsn, reconstruct_state, ctx).await
+            }
+            GetImpl::Vectored => {
+                let keyspace = KeySpace {
+                    ranges: vec![key..key.next()],
+                };

-        let vectored_res = self
-            .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
-            .await;
+                // Initialise the reconstruct state for the key with the cache
+                // entry returned above.
+                let mut reconstruct_state = ValuesReconstructState::new();

-        let key_value = vectored_res?.pop_first();
-        match key_value {
-            Some((got_key, value)) => {
-                if got_key != key {
-                    error!(
-                        "Expected {}, but singular vectored get returned {}",
-                        key, got_key
-                    );
-                    Err(PageReconstructError::Other(anyhow!(
-                        "Singular vectored get returned wrong key"
-                    )))
-                } else {
-                    value
+                let vectored_res = self
+                    .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+                    .await;
+
+                if self.conf.validate_vectored_get {
+                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
+                        .await;
+                }
+
+                let key_value = vectored_res?.pop_first();
+                match key_value {
+                    Some((got_key, value)) => {
+                        if got_key != key {
+                            error!(
+                                "Expected {}, but singular vectored get returned {}",
+                                key, got_key
+                            );
+                            Err(PageReconstructError::Other(anyhow!(
+                                "Singular vectored get returned wrong key"
+                            )))
+                        } else {
+                            value
+                        }
+                    }
+                    None => Err(PageReconstructError::MissingKey(MissingKeyError {
+                        key,
+                        shard: self.shard_identity.get_shard_number(&key),
+                        cont_lsn: Lsn(0),
+                        request_lsn: lsn,
+                        ancestor_lsn: None,
+                        traversal_path: Vec::new(),
+                        backtrace: None,
+                    })),
                }
            }
-            None => Err(PageReconstructError::MissingKey(MissingKeyError {
-                key,
-                shard: self.shard_identity.get_shard_number(&key),
-                cont_lsn: Lsn(0),
-                request_lsn: lsn,
-                ancestor_lsn: None,
-                backtrace: None,
-            })),
        }
    }

+    /// Singular read of `key` at `lsn`. Not subject to [`Self::timeline_get_throttle`].
+    async fn get_impl(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        mut reconstruct_state: ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        // XXX: structured stats collection for layer eviction here.
+        trace!(
+            "get page request for {}@{} from task kind {:?}",
+            key,
+            lsn,
+            ctx.task_kind()
+        );
+
+        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
+            .for_get_kind(GetKind::Singular)
+            .start_timer();
+        let path = self
+            .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
+            .await?;
+        timer.stop_and_record();
+
+        let start = Instant::now();
+        let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
+        let elapsed = start.elapsed();
+        crate::metrics::RECONSTRUCT_TIME
+            .for_get_kind(GetKind::Singular)
+            .observe(elapsed.as_secs_f64());
+
+        if cfg!(feature = "testing") && res.is_err() {
+            // Gathering the reconstruct data succeeded above, so it can only be a walredo issue.
+            use std::fmt::Write;
+
+            let mut msg = String::new();
+
+            path.into_iter().for_each(|(res, cont_lsn, layer)| {
+                writeln!(
+                    msg,
+                    "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
+                    layer,
+                )
+                .expect("string grows")
+            });
+
+            // Log the traversal path to rule out, or provide evidence, that in some cases we
+            // could read a duplicate walrecord.
+            tracing::info!("walredo failed, path:\n{msg}");
+        }
+
+        res
+    }
+
    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
    pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;

@@ -1008,14 +1082,28 @@ impl Timeline {
            .throttle(ctx, key_count as usize)
            .await;

-        let res = self
-            .get_vectored_impl(
-                keyspace.clone(),
-                lsn,
-                &mut ValuesReconstructState::new(),
-                ctx,
-            )
-            .await;
+        let res = match self.conf.get_vectored_impl {
+            GetVectoredImpl::Sequential => {
+                self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
+            }
+            GetVectoredImpl::Vectored => {
+                let vectored_res = self
+                    .get_vectored_impl(
+                        keyspace.clone(),
+                        lsn,
+                        &mut ValuesReconstructState::new(),
+                        ctx,
+                    )
+                    .await;
+
+                if self.conf.validate_vectored_get {
+                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
+                        .await;
+                }
+
+                vectored_res
+            }
+        };

        if let Some((metric, start)) = start {
            let elapsed = start.elapsed();
@@ -1104,6 +1192,65 @@ impl Timeline {
        vectored_res
    }

+    /// Key-by-key read via [`Self::get_impl`]. Not subject to [`Self::timeline_get_throttle`].
+    pub(super) async fn get_vectored_sequential_impl(
+        &self,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        let mut values = BTreeMap::new();
+
+        for range in keyspace.ranges {
+            let mut key = range.start;
+            while key != range.end {
+                let block = self
+                    .get_impl(key, lsn, ValueReconstructState::default(), ctx)
+                    .await;
+
+                use PageReconstructError::*;
+                match block {
+                    Err(Cancelled) => return Err(GetVectoredError::Cancelled),
+                    Err(MissingKey(_))
+                        if NON_INHERITED_RANGE.contains(&key)
+                            || NON_INHERITED_SPARSE_RANGE.contains(&key) =>
+                    {
+                        // Skip keys missing from the non-inherited ranges instead of failing the read. TODO: currently,
+                        // we assume non_inherited_range == aux_key_range. When we add more types of keys into the page
+                        // server, we should revisit this part of the code and throw errors accordingly.
+                        key = key.next();
+                    }
+                    Err(MissingKey(err)) => {
+                        return Err(GetVectoredError::MissingKey(err));
+                    }
+                    Err(Other(err))
+                        if err
+                            .to_string()
+                            .contains("downloading evicted layer file failed") =>
+                    {
+                        return Err(GetVectoredError::Other(err))
+                    }
+                    Err(Other(err))
+                        if err
+                            .chain()
+                            .any(|cause| cause.to_string().contains("layer loading failed")) =>
+                    {
+                        // The intent here is to achieve error parity with the vectored read path.
+                        // When vectored read fails to load a layer it fails the whole read, hence
+                        // we mimic this behaviour here to keep the validation happy.
+                        return Err(GetVectoredError::Other(err));
+                    }
+                    _ => {
+                        values.insert(key, block);
+                        key = key.next();
+                    }
+                }
+            }
+        }
+
+        Ok(values)
+    }
+
    pub(super) async fn get_vectored_impl(
        &self,
        keyspace: KeySpace,
@@ -1174,6 +1321,113 @@ impl Timeline {
        Ok(results)
    }

+    /// Not subject to [`Self::timeline_get_throttle`].
+    pub(super) async fn validate_get_vectored_impl(
+        &self,
+        vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) {
+        if keyspace.overlaps(&Key::metadata_key_range()) {
+            // skip validation for metadata key range
+            return;
+        }
+
+        let sequential_res = self
+            .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx)
+            .await;
+
+        fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool {
+            use GetVectoredError::*;
+            match (lhs, rhs) {
+                (Oversized(l), Oversized(r)) => l == r,
+                (InvalidLsn(l), InvalidLsn(r)) => l == r,
+                (MissingKey(l), MissingKey(r)) => l.key == r.key,
+                (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true,
+                (Other(_), Other(_)) => true,
+                _ => false,
+            }
+        }
+
+        match (&sequential_res, vectored_res) {
+            (Err(GetVectoredError::Cancelled), _) => {},
+            (_, Err(GetVectoredError::Cancelled)) => {},
+            (Err(seq_err), Ok(_)) => {
+                panic!(concat!("Sequential get failed with {}, but vectored get did not",
+                               " - keyspace={:?} lsn={}"),
+                       seq_err, keyspace, lsn) },
+            (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => {
+                // Sequential get runs after vectored get, so it is possible for the latter
+                // to time out while waiting for its ancestor's Lsn to become ready and for the
+                // former to succeed (it essentially has a doubled wait time).
+            },
+            (Ok(_), Err(vec_err)) => {
+                panic!(concat!("Vectored get failed with {}, but sequential get did not",
+                               " - keyspace={:?} lsn={}"),
+                       vec_err, keyspace, lsn) },
+            (Err(seq_err), Err(vec_err)) => {
+                assert!(errors_match(seq_err, vec_err),
+                        "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")},
+            (Ok(seq_values), Ok(vec_values)) => {
+                seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| {
+                    assert_eq!(seq_key, vec_key);
+                    match (seq_res, vec_res) {
+                        (Ok(seq_blob), Ok(vec_blob)) => {
+                            Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob);
+                        },
+                        (Err(err), Ok(_)) => {
+                            panic!(
+                                concat!("Sequential get failed with {} for key {}, but vectored get did not",
+                                        " - keyspace={:?} lsn={}"),
+                                err, seq_key, keyspace, lsn) },
+                        (Ok(_), Err(err)) => {
+                            panic!(
+                                concat!("Vectored get failed with {} for key {}, but sequential get did not",
+                                        " - keyspace={:?} lsn={}"),
+                                err, seq_key, keyspace, lsn) },
+                        (Err(_), Err(_)) => {}
+                    }
+                })
+            }
+        }
+    }
+
+    fn validate_key_equivalence(
+        key: &Key,
+        keyspace: &KeySpace,
+        lsn: Lsn,
+        seq: &Bytes,
+        vec: &Bytes,
+    ) {
+        if *key == AUX_FILES_KEY {
+            // The value reconstruct of AUX_FILES_KEY from records is not deterministic
+            // since it uses a hash map under the hood. Hence, deserialise both results
+            // before comparing.
+            let seq_aux_dir_res = AuxFilesDirectory::des(seq);
+            let vec_aux_dir_res = AuxFilesDirectory::des(vec);
+            match (&seq_aux_dir_res, &vec_aux_dir_res) {
+                (Ok(seq_aux_dir), Ok(vec_aux_dir)) => {
+                    assert_eq!(
+                        seq_aux_dir, vec_aux_dir,
+                        "Mismatch for key {} - keyspace={:?} lsn={}",
+                        key, keyspace, lsn
+                    );
+                }
+                (Err(_), Err(_)) => {}
+                _ => {
+                    panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}");
+                }
+            }
+        } else {
+            // All other keys should reconstruct deterministically, so we simply compare the blobs.
+            assert_eq!(
+                seq, vec,
+                "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}"
+            );
+        }
+    }
+
    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
    pub(crate) fn get_last_record_lsn(&self) -> Lsn {
        self.last_record_lsn.load().last
@@ -1675,9 +1929,6 @@ impl Timeline {
        tracing::debug!("Cancelling CancellationToken");
        self.cancel.cancel();

-        // Ensure Prevent new page service requests from starting.
-        self.handles.shutdown();
-
        // Transition the remote_client into a state where it's only useful for timeline deletion.
        // (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
        self.remote_client.stop();
@@ -2203,8 +2454,6 @@ impl Timeline {
                extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),

                l0_flush_global_state: resources.l0_flush_global_state,
-
-                handles: Default::default(),
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2488,10 +2737,6 @@ impl Timeline {
        // Tenant::create_timeline will wait for these uploads to happen before returning, or
        // on retry.

-        // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
-        drop(guard); // drop write lock, update_layer_visibility will take a read lock.
-        self.update_layer_visibility().await;
-
        info!(
            "loaded layer map with {} layers at {}, total physical size: {}",
            num_layers, disk_consistent_lsn, total_physical_size
@@ -2938,22 +3183,14 @@ impl Timeline {

        let guard = self.layers.read().await;

-        let resident = guard.likely_resident_layers().filter_map(|layer| {
-            match layer.visibility() {
-                LayerVisibilityHint::Visible => {
-                    // Layer is visible to one or more read LSNs: elegible for inclusion in layer map
-                    let last_activity_ts = layer.latest_activity();
-                    Some(HeatMapLayer::new(
-                        layer.layer_desc().layer_name(),
-                        layer.metadata(),
-                        last_activity_ts,
-                    ))
-                }
-                LayerVisibilityHint::Covered => {
-                    // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
-                    None
-                }
-            }
+        let resident = guard.likely_resident_layers().map(|layer| {
+            let last_activity_ts = layer.access_stats().latest_activity();
+
+            HeatMapLayer::new(
+                layer.layer_desc().layer_name(),
+                layer.metadata(),
+                last_activity_ts,
+            )
        });

        let layers = resident.collect();
@@ -2971,7 +3208,228 @@ impl Timeline {
    }
 }

+type TraversalId = Arc<str>;
+
+trait TraversalLayerExt {
+    fn traversal_id(&self) -> TraversalId;
+}
+
+impl TraversalLayerExt for Layer {
+    fn traversal_id(&self) -> TraversalId {
+        Arc::clone(self.debug_str())
+    }
+}
+
+impl TraversalLayerExt for Arc<InMemoryLayer> {
+    fn traversal_id(&self) -> TraversalId {
+        Arc::clone(self.local_path_str())
+    }
+}
+
 impl Timeline {
+    ///
+    /// Gather into `reconstruct_state` the data needed to reconstruct `key` at `request_lsn`.
+    ///
+    /// The search may continue on an ancestor timeline, if the key is inherited
+    /// and the search LSN has reached this timeline's `ancestor_lsn`.
+    ///
+    /// This function takes the read lock on each visited timeline's layer map itself and
+    /// drops it before reading from a layer. On success, returns the path of layers visited.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
+    async fn get_reconstruct_data(
+        &self,
+        key: Key,
+        request_lsn: Lsn,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<Vec<TraversalPathItem>, PageReconstructError> {
+        // Start from the current timeline.
+        let mut timeline_owned;
+        let mut timeline = self;
+
+        let mut read_count = scopeguard::guard(0, |cnt| {
+            crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
+        });
+
+        // For debugging purposes, collect the path of layers that we traversed
+        // through. It's included in the error message if we fail to find the key.
+        let mut traversal_path = Vec::<TraversalPathItem>::new();
+
+        let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
+            *cached_lsn
+        } else {
+            Lsn(0)
+        };
+
+        // 'prev_lsn' tracks the last LSN that we were at in our search. It's used
+        // to check that each iteration makes some progress, to break infinite
+        // looping if something goes wrong.
+        let mut prev_lsn = None;
+
+        let mut result = ValueReconstructResult::Continue;
+        let mut cont_lsn = Lsn(request_lsn.0 + 1);
+
+        'outer: loop {
+            if self.cancel.is_cancelled() {
+                return Err(PageReconstructError::Cancelled);
+            }
+
+            // The function should have updated 'state'
+            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
+            match result {
+                ValueReconstructResult::Complete => return Ok(traversal_path),
+                ValueReconstructResult::Continue => {
+                    // If we reached an earlier cached page image, we're done.
+                    if cont_lsn == cached_lsn + 1 {
+                        return Ok(traversal_path);
+                    }
+                    if let Some(prev) = prev_lsn {
+                        if prev <= cont_lsn {
+                            // Didn't make any progress in last iteration. Error out to avoid
+                            // getting stuck in the loop.
+                            return Err(PageReconstructError::MissingKey(MissingKeyError {
+                                key,
+                                shard: self.shard_identity.get_shard_number(&key),
+                                cont_lsn: Lsn(cont_lsn.0 - 1),
+                                request_lsn,
+                                ancestor_lsn: Some(timeline.ancestor_lsn),
+                                traversal_path,
+                                backtrace: None,
+                            }));
+                        }
+                    }
+                    prev_lsn = Some(cont_lsn);
+                }
+                ValueReconstructResult::Missing => {
+                    return Err(PageReconstructError::MissingKey(MissingKeyError {
+                        key,
+                        shard: self.shard_identity.get_shard_number(&key),
+                        cont_lsn,
+                        request_lsn,
+                        ancestor_lsn: None,
+                        traversal_path,
+                        backtrace: if cfg!(test) {
+                            Some(std::backtrace::Backtrace::force_capture())
+                        } else {
+                            None
+                        },
+                    }));
+                }
+            }
+
+            // Recurse into ancestor if needed
+            if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
+                if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
+                    trace!(
+                        "going into ancestor {}, cont_lsn is {}",
+                        timeline.ancestor_lsn,
+                        cont_lsn
+                    );
+
+                    timeline_owned = timeline
+                        .get_ready_ancestor_timeline(ancestor_timeline, ctx)
+                        .await?;
+                    timeline = &*timeline_owned;
+                    prev_lsn = None;
+                    continue 'outer;
+                }
+            }
+
+            let guard = timeline.layers.read().await;
+            let layers = guard.layer_map();
+
+            // Check the open and frozen in-memory layers first, in order from newest
+            // to oldest.
+            if let Some(open_layer) = &layers.open_layer {
+                let start_lsn = open_layer.get_lsn_range().start;
+                if cont_lsn > start_lsn {
+                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display());
+                    // Get all the data needed to reconstruct the page version from this layer.
+                    // But if we have an older cached page image, no need to go past that.
+                    let lsn_floor = max(cached_lsn + 1, start_lsn);
+
+                    let open_layer = open_layer.clone();
+                    drop(guard);
+
+                    result = match open_layer
+                        .get_value_reconstruct_data(
+                            key,
+                            lsn_floor..cont_lsn,
+                            reconstruct_state,
+                            ctx,
+                        )
+                        .await
+                    {
+                        Ok(result) => result,
+                        Err(e) => return Err(PageReconstructError::from(e)),
+                    };
+                    cont_lsn = lsn_floor;
+                    *read_count += 1;
+                    traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
+                    continue 'outer;
+                }
+            }
+            for frozen_layer in layers.frozen_layers.iter().rev() {
+                let start_lsn = frozen_layer.get_lsn_range().start;
+                if cont_lsn > start_lsn {
+                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display());
+                    let lsn_floor = max(cached_lsn + 1, start_lsn);
+
+                    let frozen_layer = frozen_layer.clone();
+                    drop(guard);
+
+                    result = match frozen_layer
+                        .get_value_reconstruct_data(
+                            key,
+                            lsn_floor..cont_lsn,
+                            reconstruct_state,
+                            ctx,
+                        )
+                        .await
+                    {
+                        Ok(result) => result,
+                        Err(e) => return Err(PageReconstructError::from(e)),
+                    };
+                    cont_lsn = lsn_floor;
+                    *read_count += 1;
+                    traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
+                    continue 'outer;
+                }
+            }
+
+            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
+                let layer = guard.get_from_desc(&layer);
+                drop(guard);
+                // Get all the data needed to reconstruct the page version from this layer.
+                // But if we have an older cached page image, no need to go past that.
+                let lsn_floor = max(cached_lsn + 1, lsn_floor);
+                result = match layer
+                    .get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx)
+                    .await
+                {
+                    Ok(result) => result,
+                    Err(e) => return Err(PageReconstructError::from(e)),
+                };
+                cont_lsn = lsn_floor;
+                *read_count += 1;
+                traversal_path.push((result, cont_lsn, layer.traversal_id()));
+                continue 'outer;
+            } else if timeline.ancestor_timeline.is_some() {
+                // Nothing on this timeline. Traverse to parent
+                result = ValueReconstructResult::Continue;
+                cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
+                continue 'outer;
+            } else {
+                // Nothing found
+                result = ValueReconstructResult::Missing;
+                continue 'outer;
+            }
+        }
+    }
+
    #[allow(clippy::doc_lazy_continuation)]
    /// Get the data needed to reconstruct all keys in the provided keyspace
    ///
@@ -3065,6 +3523,7 @@ impl Timeline {
                cont_lsn,
                request_lsn,
                ancestor_lsn: Some(timeline.ancestor_lsn),
+                traversal_path: vec![],
                backtrace: None,
            }));
        }
@@ -3264,17 +3723,6 @@ impl Timeline {
        &self.shard_identity
    }

-    #[inline(always)]
-    pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
-        ShardTimelineId {
-            shard_index: ShardIndex {
-                shard_number: self.shard_identity.number,
-                shard_count: self.shard_identity.count,
-            },
-            timeline_id: self.timeline_id,
-        }
-    }
-
    ///
    /// Get a handle to the latest layer for appending.
    ///
@@ -3627,21 +4075,6 @@ impl Timeline {
            // release lock on 'layers'
        };

-        // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
-        // This makes us refuse ingest until the new layers have been persisted to the remote.
-        self.remote_client
-            .wait_completion()
-            .await
-            .map_err(|e| match e {
-                WaitCompletionError::UploadQueueShutDownOrStopped
-                | WaitCompletionError::NotInitialized(
-                    NotInitialized::ShuttingDown | NotInitialized::Stopped,
-                ) => FlushLayerError::Cancelled,
-                WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => {
-                    FlushLayerError::Other(anyhow!(e).into())
-                }
-            })?;
-
        // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
        // a compaction can delete the file and then it won't be available for uploads any more.
        // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
@@ -3657,11 +4090,17 @@ impl Timeline {

    /// Return true if the value changed
    ///
-    /// This function must only be used from the layer flush task.
+    /// This function must only be used from the layer flush task, and may not be called concurrently.
    fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
-        let old_value = self.disk_consistent_lsn.fetch_max(new_value);
-        assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");
-        new_value != old_value
+        // We do a simple load/store cycle: that's why this function isn't safe for concurrent use.
+        let old_value = self.disk_consistent_lsn.load();
+        if new_value != old_value {
+            assert!(new_value >= old_value);
+            self.disk_consistent_lsn.store(new_value);
+            true
+        } else {
+            false
+        }
    }

    /// Update metadata file
@@ -3728,14 +4167,12 @@ impl Timeline {
        let frozen_layer = Arc::clone(frozen_layer);
        let ctx = ctx.attached_child();
        let work = async move {
-            let Some((desc, path)) = frozen_layer
-                .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner())
+            let Some(new_delta) = frozen_layer
+                .write_to_disk(&self_clone, &ctx, key_range)
                .await?
            else {
                return Ok(None);
            };
-            let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?;
-
            // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
            // We just need to fsync the directory in which these inodes are linked,
            // which we know to be the timeline directory.
@@ -4230,6 +4667,27 @@ impl Timeline {
            }
        }

+        // The writer.finish() above already did the fsync of the inodes.
+        // We just need to fsync the directory in which these inodes are linked,
+        // which we know to be the timeline directory.
+        if !image_layers.is_empty() {
+            // We use fatal_err() below because after writer.finish() returns with success,
+            // the in-memory state of the filesystem already has the layer file in its final place,
+            // and subsequent pageserver code could think it's durable while it really isn't.
+            let timeline_dir = VirtualFile::open(
+                &self
+                    .conf
+                    .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+                ctx,
+            )
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+        }
+
        let mut guard = self.layers.write().await;

        // FIXME: we could add the images to be uploaded *before* returning from here, but right
@@ -4238,9 +4696,6 @@ impl Timeline {
        drop_wlock(guard);
        timer.stop_and_record();

-        // Creating image layers may have caused some previously visible layers to be covered
-        self.update_layer_visibility().await;
-
        Ok(image_layers)
    }

@@ -4258,12 +4713,6 @@ impl Timeline {
            return;
        }

-        if self.current_logical_size.current_size().is_exact() {
-            // root timelines are initialized with exact count, but never start the background
-            // calculation
-            return;
-        }
-
        if let Some(await_bg_cancel) = self
            .current_logical_size
            .cancel_wait_for_background_loop_concurrency_limit_semaphore
@@ -5011,22 +5460,20 @@ impl Timeline {
                } else {
                    trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
                };
-                let res = self
+
+                let img = match self
                    .walredo_mgr
                    .as_ref()
                    .context("timeline has no walredo manager")
                    .map_err(PageReconstructError::WalRedo)?
                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
-                    .await;
-                let img = match res {
+                    .await
+                    .context("reconstruct a page image")
+                {
                    Ok(img) => img,
-                    Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
-                    Err(walredo::Error::Other(e)) => {
-                        return Err(PageReconstructError::WalRedo(
-                            e.context("reconstruct a page image"),
-                        ))
-                    }
+                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
                };
+
                Ok(img)
            }
        }
@@ -5211,7 +5658,7 @@ impl Timeline {
                let file_size = layer.layer_desc().file_size;
                max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));

-                let last_activity_ts = layer.latest_activity();
+                let last_activity_ts = layer.access_stats().latest_activity();

                EvictionCandidate {
                    layer: layer.into(),
@@ -5234,22 +5681,6 @@ impl Timeline {
        }
    }

-    /// Persistently blocks gc for `Manual` reason.
-    ///
-    /// Returns true if no such block existed before, false otherwise.
-    pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result<bool> {
-        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
-        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
-        tenant.gc_block.insert(self, GcBlockingReason::Manual).await
-    }
-
-    /// Persistently unblocks gc for `Manual` reason.
-    pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> {
-        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
-        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
-        tenant.gc_block.remove(self, GcBlockingReason::Manual).await
-    }
-
    #[cfg(test)]
    pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
        self.last_record_lsn.advance(new_lsn);
@@ -5368,8 +5799,9 @@ impl Timeline {
        for (key, lsn, val) in deltas.data {
            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
        }
-        let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
-        let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+        let delta_layer = delta_layer_writer
+            .finish(deltas.key_range.end, self, ctx)
+            .await?;

        {
            let mut guard = self.layers.write().await;
@@ -5430,6 +5862,8 @@ impl Timeline {
    }
 }

+type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
+
 /// Tracking writes ingestion does to a particular in-memory layer.
 ///
 /// Cleared upon freezing a layer.
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,7 +4,7 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.

-use std::collections::{BinaryHeap, HashSet};
+use std::collections::BinaryHeap;
 use std::ops::{Deref, Range};
 use std::sync::Arc;

@@ -15,14 +15,11 @@ use super::{
 };

 use anyhow::{anyhow, Context};
-use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
-use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
-use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, info_span, trace, warn, Instrument};
 use utils::id::TimelineId;
@@ -32,9 +29,7 @@ use crate::page_cache;
 use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
-use crate::tenant::storage_layer::{
-    AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
-};
+use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState};
 use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{Layer, ResidentLayer};
@@ -43,7 +38,6 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};

 use crate::keyspace::KeySpace;
 use crate::repository::{Key, Value};
-use crate::walrecord::NeonWalRecord;

 use utils::lsn::Lsn;

@@ -75,130 +69,37 @@ impl KeyHistoryRetention {
        self,
        key: Key,
        delta_writer: &mut Vec<(Key, Lsn, Value)>,
-        mut image_writer: Option<&mut ImageLayerWriter>,
-        stat: &mut CompactionStatistics,
+        image_writer: &mut ImageLayerWriter,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut first_batch = true;
-        for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
+        for (_, KeyLogAtLsn(logs)) in self.below_horizon {
            if first_batch {
                if logs.len() == 1 && logs[0].1.is_image() {
                    let Value::Image(img) = &logs[0].1 else {
                        unreachable!()
                    };
-                    stat.produce_image_key(img);
-                    if let Some(image_writer) = image_writer.as_mut() {
-                        image_writer.put_image(key, img.clone(), ctx).await?;
-                    } else {
-                        delta_writer.push((key, cutoff_lsn, Value::Image(img.clone())));
-                    }
+                    image_writer.put_image(key, img.clone(), ctx).await?;
                } else {
                    for (lsn, val) in logs {
-                        stat.produce_key(&val);
                        delta_writer.push((key, lsn, val));
                    }
                }
                first_batch = false;
            } else {
                for (lsn, val) in logs {
-                    stat.produce_key(&val);
                    delta_writer.push((key, lsn, val));
                }
            }
        }
        let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
        for (lsn, val) in above_horizon_logs {
-            stat.produce_key(&val);
            delta_writer.push((key, lsn, val));
        }
        Ok(())
    }
 }

-#[derive(Debug, Serialize, Default)]
-struct CompactionStatisticsNumSize {
-    num: u64,
-    size: u64,
-}
-
-#[derive(Debug, Serialize, Default)]
-pub struct CompactionStatistics {
-    delta_layer_visited: CompactionStatisticsNumSize,
-    image_layer_visited: CompactionStatisticsNumSize,
-    delta_layer_produced: CompactionStatisticsNumSize,
-    image_layer_produced: CompactionStatisticsNumSize,
-    num_delta_layer_discarded: usize,
-    num_image_layer_discarded: usize,
-    num_unique_keys_visited: usize,
-    wal_keys_visited: CompactionStatisticsNumSize,
-    image_keys_visited: CompactionStatisticsNumSize,
-    wal_produced: CompactionStatisticsNumSize,
-    image_produced: CompactionStatisticsNumSize,
-}
-
-impl CompactionStatistics {
-    fn estimated_size_of_value(val: &Value) -> usize {
-        match val {
-            Value::Image(img) => img.len(),
-            Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
-            _ => std::mem::size_of::<NeonWalRecord>(),
-        }
-    }
-    fn estimated_size_of_key() -> usize {
-        KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
-    }
-    fn visit_delta_layer(&mut self, size: u64) {
-        self.delta_layer_visited.num += 1;
-        self.delta_layer_visited.size += size;
-    }
-    fn visit_image_layer(&mut self, size: u64) {
-        self.image_layer_visited.num += 1;
-        self.image_layer_visited.size += size;
-    }
-    fn on_unique_key_visited(&mut self) {
-        self.num_unique_keys_visited += 1;
-    }
-    fn visit_wal_key(&mut self, val: &Value) {
-        self.wal_keys_visited.num += 1;
-        self.wal_keys_visited.size +=
-            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
-    }
-    fn visit_image_key(&mut self, val: &Value) {
-        self.image_keys_visited.num += 1;
-        self.image_keys_visited.size +=
-            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
-    }
-    fn produce_key(&mut self, val: &Value) {
-        match val {
-            Value::Image(img) => self.produce_image_key(img),
-            Value::WalRecord(_) => self.produce_wal_key(val),
-        }
-    }
-    fn produce_wal_key(&mut self, val: &Value) {
-        self.wal_produced.num += 1;
-        self.wal_produced.size +=
-            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
-    }
-    fn produce_image_key(&mut self, val: &Bytes) {
-        self.image_produced.num += 1;
-        self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
-    }
-    fn discard_delta_layer(&mut self) {
-        self.num_delta_layer_discarded += 1;
-    }
-    fn discard_image_layer(&mut self) {
-        self.num_image_layer_discarded += 1;
-    }
-    fn produce_delta_layer(&mut self, size: u64) {
-        self.delta_layer_produced.num += 1;
-        self.delta_layer_produced.size += size;
-    }
-    fn produce_image_layer(&mut self, size: u64) {
-        self.image_layer_produced.num += 1;
-        self.image_layer_produced.size += size;
-    }
-}
-
 impl Timeline {
    /// TODO: cancellation
    ///
@@ -210,18 +111,12 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<bool, CompactionError> {
        if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
-            self.compact_with_gc(cancel, flags, ctx)
+            self.compact_with_gc(cancel, ctx)
                .await
                .map_err(CompactionError::Other)?;
            return Ok(false);
        }

-        if flags.contains(CompactFlags::DryRun) {
-            return Err(CompactionError::Other(anyhow!(
-                "dry-run mode is not supported for legacy compaction for now"
-            )));
-        }
-
        // High level strategy for compaction / image creation:
        //
        // 1. First, calculate the desired "partitioning" of the
@@ -543,45 +438,6 @@ impl Timeline {
        Ok(())
    }

-    /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
-    /// an image layer between them and the most recent readable LSN (branch point or tip of timeline).  The
-    /// purpose of the visibility hint is to record which layers need to be available to service reads.
-    ///
-    /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
-    /// that we know won't be needed for reads.
-    pub(super) async fn update_layer_visibility(&self) {
-        let head_lsn = self.get_last_record_lsn();
-
-        // We will sweep through layers in reverse-LSN order.  We only do historic layers.  L0 deltas
-        // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here.
-        // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
-        // they will be subject to L0->L1 compaction in the near future.
-        let layer_manager = self.layers.read().await;
-        let layer_map = layer_manager.layer_map();
-
-        let readable_points = {
-            let children = self.gc_info.read().unwrap().retain_lsns.clone();
-
-            let mut readable_points = Vec::with_capacity(children.len() + 1);
-            for (child_lsn, _child_timeline_id) in &children {
-                readable_points.push(*child_lsn);
-            }
-            readable_points.push(head_lsn);
-            readable_points
-        };
-
-        let (layer_visibility, covered) = layer_map.get_visibility(readable_points);
-        for (layer_desc, visibility) in layer_visibility {
-            // FIXME: a more efficiency bulk zip() through the layers rather than NlogN getting each one
-            let layer = layer_manager.get_from_desc(&layer_desc);
-            layer.set_visibility(visibility);
-        }
-
-        // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
-        // avoid assuming that everything at a branch point is visible.
-        drop(covered);
-    }
-
    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
    /// as Level 1 files. Returns whether the L0 layers are fully compacted.
    async fn compact_level0(
@@ -939,16 +795,14 @@ impl Timeline {
                        || contains_hole
                    {
                        // ... if so, flush previous layer and prepare to write new one
-                        let (desc, path) = writer
-                            .take()
-                            .unwrap()
-                            .finish(prev_key.unwrap().next(), ctx)
-                            .await
-                            .map_err(CompactionError::Other)?;
-                        let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
-                            .map_err(CompactionError::Other)?;
-
-                        new_layers.push(new_delta);
+                        new_layers.push(
+                            writer
+                                .take()
+                                .unwrap()
+                                .finish(prev_key.unwrap().next(), self, ctx)
+                                .await
+                                .map_err(CompactionError::Other)?,
+                        );
                        writer = None;

                        if contains_hole {
@@ -1011,13 +865,12 @@ impl Timeline {
            prev_key = Some(key);
        }
        if let Some(writer) = writer {
-            let (desc, path) = writer
-                .finish(prev_key.unwrap().next(), ctx)
-                .await
-                .map_err(CompactionError::Other)?;
-            let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
-                .map_err(CompactionError::Other)?;
-            new_layers.push(new_delta);
+            new_layers.push(
+                writer
+                    .finish(prev_key.unwrap().next(), self, ctx)
+                    .await
+                    .map_err(CompactionError::Other)?,
+            );
        }

        // Sync layers
@@ -1265,22 +1118,21 @@ impl Timeline {
    pub(crate) async fn generate_key_retention(
        self: &Arc<Timeline>,
        key: Key,
-        full_history: &[(Key, Lsn, Value)],
+        history: &[(Key, Lsn, Value)],
        horizon: Lsn,
        retain_lsn_below_horizon: &[Lsn],
        delta_threshold_cnt: usize,
-        base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
    ) -> anyhow::Result<KeyHistoryRetention> {
        // Pre-checks for the invariants
        if cfg!(debug_assertions) {
-            for (log_key, _, _) in full_history {
+            for (log_key, _, _) in history {
                assert_eq!(log_key, &key, "mismatched key");
            }
-            for i in 1..full_history.len() {
-                assert!(full_history[i - 1].1 <= full_history[i].1, "unordered LSN");
-                if full_history[i - 1].1 == full_history[i].1 {
+            for i in 1..history.len() {
+                assert!(history[i - 1].1 <= history[i].1, "unordered LSN");
+                if history[i - 1].1 == history[i].1 {
                    assert!(
-                        matches!(full_history[i - 1].2, Value::Image(_)),
+                        matches!(history[i - 1].2, Value::Image(_)),
                        "unordered delta/image, or duplicated delta"
                    );
                }
@@ -1299,7 +1151,6 @@ impl Timeline {
                );
            }
        }
-        let has_ancestor = base_img_from_ancestor.is_some();
        // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
        // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
        let (mut split_history, lsn_split_points) = {
@@ -1311,7 +1162,7 @@ impl Timeline {
            }
            lsn_split_points.push(horizon);
            let mut current_idx = 0;
-            for item @ (_, lsn, _) in full_history {
+            for item @ (_, lsn, _) in history {
                while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] {
                    current_idx += 1;
                }
@@ -1333,9 +1184,6 @@ impl Timeline {
                        // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
                        // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
                        // dropped.
-                        //
-                        // TODO: in case we have both delta + images for a given LSN and it does not exceed the delta
-                        // threshold, we could have kept delta instead to save space. This is an optimization for the future.
                        continue;
                    }
                }
@@ -1353,75 +1201,9 @@ impl Timeline {
            "should have at least below + above horizon batches"
        );
        let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
-        if let Some((key, lsn, img)) = base_img_from_ancestor {
-            replay_history.push((key, lsn, Value::Image(img)));
-        }
-
-        /// Generate debug information for the replay history
-        fn generate_history_trace(replay_history: &[(Key, Lsn, Value)]) -> String {
-            use std::fmt::Write;
-            let mut output = String::new();
-            if let Some((key, _, _)) = replay_history.first() {
-                write!(output, "key={} ", key).unwrap();
-                let mut cnt = 0;
-                for (_, lsn, val) in replay_history {
-                    if val.is_image() {
-                        write!(output, "i@{} ", lsn).unwrap();
-                    } else if val.will_init() {
-                        write!(output, "di@{} ", lsn).unwrap();
-                    } else {
-                        write!(output, "d@{} ", lsn).unwrap();
-                    }
-                    cnt += 1;
-                    if cnt >= 128 {
-                        write!(output, "... and more").unwrap();
-                        break;
-                    }
-                }
-            } else {
-                write!(output, "<no history>").unwrap();
-            }
-            output
-        }
-
-        fn generate_debug_trace(
-            replay_history: Option<&[(Key, Lsn, Value)]>,
-            full_history: &[(Key, Lsn, Value)],
-            lsns: &[Lsn],
-            horizon: Lsn,
-        ) -> String {
-            use std::fmt::Write;
-            let mut output = String::new();
-            if let Some(replay_history) = replay_history {
-                writeln!(
-                    output,
-                    "replay_history: {}",
-                    generate_history_trace(replay_history)
-                )
-                .unwrap();
-            } else {
-                writeln!(output, "replay_history: <disabled>",).unwrap();
-            }
-            writeln!(
-                output,
-                "full_history: {}",
-                generate_history_trace(full_history)
-            )
-            .unwrap();
-            writeln!(
-                output,
-                "when processing: [{}] horizon={}",
-                lsns.iter().map(|l| format!("{l}")).join(","),
-                horizon
-            )
-            .unwrap();
-            output
-        }
-
        for (i, split_for_lsn) in split_history.into_iter().enumerate() {
-            // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
            records_since_last_image += split_for_lsn.len();
-            let generate_image = if i == 0 && !has_ancestor {
+            let generate_image = if i == 0 {
                // We always generate images for the first batch (below horizon / lowest retain_lsn)
                true
            } else if i == batch_cnt - 1 {
@@ -1442,27 +1224,10 @@ impl Timeline {
                }
            }
            if let Some((_, _, val)) = replay_history.first() {
-                if !val.will_init() {
-                    return Err(anyhow::anyhow!("invalid history, no base image")).with_context(
-                        || {
-                            generate_debug_trace(
-                                Some(&replay_history),
-                                full_history,
-                                retain_lsn_below_horizon,
-                                horizon,
-                            )
-                        },
-                    );
-                }
+                assert!(val.will_init(), "invalid history, no base image");
            }
            if generate_image && records_since_last_image > 0 {
                records_since_last_image = 0;
-                let replay_history_for_debug = if cfg!(debug_assertions) {
-                    Some(replay_history.clone())
-                } else {
-                    None
-                };
-                let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
                let history = std::mem::take(&mut replay_history);
                let mut img = None;
                let mut records = Vec::with_capacity(history.len());
@@ -1470,30 +1235,14 @@ impl Timeline {
                    img = Some((*lsn, val.clone()));
                    for (_, lsn, val) in history.into_iter().skip(1) {
                        let Value::WalRecord(rec) = val else {
-                            return Err(anyhow::anyhow!(
-                                "invalid record, first record is image, expect walrecords"
-                            ))
-                            .with_context(|| {
-                                generate_debug_trace(
-                                    replay_history_for_debug_ref,
-                                    full_history,
-                                    retain_lsn_below_horizon,
-                                    horizon,
-                                )
-                            });
+                            panic!("invalid record")
                        };
                        records.push((lsn, rec));
                    }
                } else {
                    for (_, lsn, val) in history.into_iter() {
                        let Value::WalRecord(rec) = val else {
-                            return Err(anyhow::anyhow!("invalid record, first record is walrecord, expect rest are walrecord"))
-                                .with_context(|| generate_debug_trace(
-                                    replay_history_for_debug_ref,
-                                    full_history,
-                                    retain_lsn_below_horizon,
-                                    horizon,
-                                ));
+                            panic!("invalid record")
                        };
                        records.push((lsn, rec));
                    }
@@ -1505,11 +1254,12 @@ impl Timeline {
                replay_history.push((key, request_lsn, Value::Image(img.clone())));
                retention.push(vec![(request_lsn, Value::Image(img))]);
            } else {
-                let deltas = split_for_lsn
-                    .iter()
-                    .map(|(_, lsn, value)| (*lsn, value.clone()))
-                    .collect_vec();
-                retention.push(deltas);
+                retention.push(
+                    split_for_lsn
+                        .iter()
+                        .map(|(_, lsn, value)| (*lsn, value.clone()))
+                        .collect(),
+                );
            }
        }
        let mut result = Vec::with_capacity(retention.len());
@@ -1524,7 +1274,7 @@ impl Timeline {
                result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
            }
        }
-        unreachable!("key retention is empty")
+        unreachable!()
    }

    /// An experimental compaction building block that combines compaction with garbage collection.
@@ -1535,41 +1285,17 @@ impl Timeline {
    /// and create delta layers with all deltas >= gc horizon.
    pub(crate) async fn compact_with_gc(
        self: &Arc<Self>,
-        cancel: &CancellationToken,
-        flags: EnumSet<CompactFlags>,
+        _cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        use std::collections::BTreeSet;

-        // Block other compaction/GC tasks from running for now. GC-compaction could run along
-        // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
-        // Note that we already acquired the compaction lock when the outer `compact` function gets called.
-
-        let gc_lock = async {
-            tokio::select! {
-                guard = self.gc_lock.lock() => Ok(guard),
-                // TODO: refactor to CompactionError to correctly pass cancelled error
-                _ = cancel.cancelled() => Err(anyhow!("cancelled")),
-            }
-        };
-
-        let gc_lock = crate::timed(
-            gc_lock,
-            "acquires gc lock",
-            std::time::Duration::from_secs(5),
-        )
-        .await?;
-
-        let dry_run = flags.contains(CompactFlags::DryRun);
-
-        info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
+        info!("running enhanced gc bottom-most compaction");

        scopeguard::defer! {
            info!("done enhanced gc bottom-most compaction");
        };

-        let mut stat = CompactionStatistics::default();
-
        // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
        // The layer selection has the following properties:
        // 1. If a layer is in the selection, all layers below it are in the selection.
@@ -1600,25 +1326,20 @@ impl Timeline {
            retain_lsns_below_horizon.sort();
            (selected_layers, gc_cutoff, retain_lsns_below_horizon)
        };
-        let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
-            Lsn(self.ancestor_lsn.0 + 1)
-        } else {
-            let res = retain_lsns_below_horizon
-                .first()
-                .copied()
-                .unwrap_or(gc_cutoff);
-            if cfg!(debug_assertions) {
-                assert_eq!(
-                    res,
-                    retain_lsns_below_horizon
-                        .iter()
-                        .min()
-                        .copied()
-                        .unwrap_or(gc_cutoff)
-                );
-            }
-            res
-        };
+        let lowest_retain_lsn = retain_lsns_below_horizon
+            .first()
+            .copied()
+            .unwrap_or(gc_cutoff);
+        if cfg!(debug_assertions) {
+            assert_eq!(
+                lowest_retain_lsn,
+                retain_lsns_below_horizon
+                    .iter()
+                    .min()
+                    .copied()
+                    .unwrap_or(gc_cutoff)
+            );
+        }
        info!(
            "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
            layer_selection.len(),
@@ -1640,9 +1361,6 @@ impl Timeline {
                let key_range = desc.get_key_range();
                delta_split_points.insert(key_range.start);
                delta_split_points.insert(key_range.end);
-                stat.visit_delta_layer(desc.file_size());
-            } else {
-                stat.visit_image_layer(desc.file_size());
            }
        }
        let mut delta_layers = Vec::new();
@@ -1662,14 +1380,6 @@ impl Timeline {
        let mut accumulated_values = Vec::new();
        let mut last_key: Option<Key> = None;

-        enum FlushDeltaResult {
-            /// Create a new resident layer
-            CreateResidentLayer(ResidentLayer),
-            /// Keep an original delta layer
-            KeepLayer(PersistentLayerKey),
-        }
-
-        #[allow(clippy::too_many_arguments)]
        async fn flush_deltas(
            deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
            last_key: Key,
@@ -1678,10 +1388,7 @@ impl Timeline {
            tline: &Arc<Timeline>,
            lowest_retain_lsn: Lsn,
            ctx: &RequestContext,
-            stats: &mut CompactionStatistics,
-            dry_run: bool,
-            last_batch: bool,
-        ) -> anyhow::Result<Option<FlushDeltaResult>> {
+        ) -> anyhow::Result<Option<ResidentLayer>> {
            // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
            // overlapping layers.
            //
@@ -1701,176 +1408,46 @@ impl Timeline {
                *current_delta_split_point += 1;
                need_split = true;
            }
-            if !need_split && !last_batch {
+            if !need_split {
                return Ok(None);
            }
-            let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas);
+            let deltas = std::mem::take(deltas);
            if deltas.is_empty() {
                return Ok(None);
            }
            let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1;
-            let delta_key = PersistentLayerKey {
-                key_range: {
-                    let key_start = deltas.first().unwrap().0;
-                    let key_end = deltas.last().unwrap().0.next();
-                    key_start..key_end
-                },
-                lsn_range: lowest_retain_lsn..end_lsn,
-                is_delta: true,
-            };
-            {
-                // Hack: skip delta layer if we need to produce a layer of a same key-lsn.
-                //
-                // This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
-                // For example, consider the case where a single delta with range [0x10,0x50) exists.
-                // And we have branches at LSN 0x10, 0x20, 0x30.
-                // Then we delete branch @ 0x20.
-                // Bottom-most compaction may now delete the delta [0x20,0x30).
-                // And that wouldnt' change the shape of the layer.
-                //
-                // Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
-                // That's why it's safe to skip.
-                let guard = tline.layers.read().await;
-
-                if guard.contains_key(&delta_key) {
-                    let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
-                    drop(guard);
-                    if layer_generation == tline.generation {
-                        stats.discard_delta_layer();
-                        // TODO: depending on whether we design this compaction process to run along with
-                        // other compactions, there could be layer map modifications after we drop the
-                        // layer guard, and in case it creates duplicated layer key, we will still error
-                        // in the end.
-                        info!(
-                            key=%delta_key,
-                            ?layer_generation,
-                            "discard delta layer due to duplicated layer in the same generation"
-                        );
-                        return Ok(Some(FlushDeltaResult::KeepLayer(delta_key)));
-                    }
-                }
-            }
-
            let mut delta_layer_writer = DeltaLayerWriter::new(
                tline.conf,
                tline.timeline_id,
                tline.tenant_shard_id,
-                delta_key.key_range.start,
+                deltas.first().unwrap().0,
                lowest_retain_lsn..end_lsn,
                ctx,
            )
            .await?;
+            let key_end = deltas.last().unwrap().0.next();
            for (key, lsn, val) in deltas {
                delta_layer_writer.put_value(key, lsn, val, ctx).await?;
            }
-
-            stats.produce_delta_layer(delta_layer_writer.size());
-            if dry_run {
-                return Ok(None);
-            }
-
-            let (desc, path) = delta_layer_writer
-                .finish(delta_key.key_range.end, ctx)
-                .await?;
-            let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
-            Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
+            let delta_layer = delta_layer_writer.finish(key_end, tline, ctx).await?;
+            Ok(Some(delta_layer))
        }

-        // Hack the key range to be min..(max-1). Otherwise, the image layer will be
-        // interpreted as an L0 delta layer.
-        let hack_image_layer_range = {
-            let mut end_key = Key::MAX;
-            end_key.field6 -= 1;
-            Key::MIN..end_key
-        };
-
-        // Only create image layers when there is no ancestor branches. TODO: create covering image layer
-        // when some condition meet.
-        let mut image_layer_writer = if self.ancestor_timeline.is_none() {
-            Some(
-                ImageLayerWriter::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_shard_id,
-                    &hack_image_layer_range, // covers the full key range
-                    lowest_retain_lsn,
-                    ctx,
-                )
-                .await?,
-            )
-        } else {
-            None
-        };
-
-        /// Returns None if there is no ancestor branch. Throw an error when the key is not found.
-        ///
-        /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
-        /// is needed for reconstruction. This should be fixed in the future.
-        ///
-        /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor
-        /// images.
-        async fn get_ancestor_image(
-            tline: &Arc<Timeline>,
-            key: Key,
-            ctx: &RequestContext,
-        ) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
-            if tline.ancestor_timeline.is_none() {
-                return Ok(None);
-            };
-            // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
-            // as much existing code as possible.
-            let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
-            Ok(Some((key, tline.ancestor_lsn, img)))
-        }
-        let image_layer_key = PersistentLayerKey {
-            key_range: hack_image_layer_range,
-            lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn),
-            is_delta: false,
-        };
-
-        // Like with delta layers, it can happen that we re-produce an already existing image layer.
-        // This could happen when a user triggers force compaction and image generation. In this case,
-        // it's always safe to rewrite the layer.
-        let discard_image_layer = {
-            let guard = self.layers.read().await;
-            if guard.contains_key(&image_layer_key) {
-                let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation;
-                drop(guard);
-                if layer_generation == self.generation {
-                    // TODO: depending on whether we design this compaction process to run along with
-                    // other compactions, there could be layer map modifications after we drop the
-                    // layer guard, and in case it creates duplicated layer key, we will still error
-                    // in the end.
-                    info!(
-                        key=%image_layer_key,
-                        ?layer_generation,
-                        "discard image layer due to duplicated layer key in the same generation",
-                    );
-                    true
-                } else {
-                    false
-                }
-            } else {
-                false
-            }
-        };
-
-        // Actually, we can decide not to write to the image layer at all at this point because
-        // the key and LSN range are determined. However, to keep things simple here, we still
-        // create this writer, and discard the writer in the end.
+        let mut image_layer_writer = ImageLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            &(Key::MIN..Key::MAX), // covers the full key range
+            lowest_retain_lsn,
+            ctx,
+        )
+        .await?;

        let mut delta_values = Vec::new();
        let delta_split_points = delta_split_points.into_iter().collect_vec();
        let mut current_delta_split_point = 0;
        let mut delta_layers = Vec::new();
        while let Some((key, lsn, val)) = merge_iter.next().await? {
-            if cancel.is_cancelled() {
-                return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
-            }
-            match val {
-                Value::Image(_) => stat.visit_image_key(&val),
-                Value::WalRecord(_) => stat.visit_wal_key(&val),
-            }
            if last_key.is_none() || last_key.as_ref() == Some(&key) {
                if last_key.is_none() {
                    last_key = Some(key);
@@ -1878,7 +1455,6 @@ impl Timeline {
                accumulated_values.push((key, lsn, val));
            } else {
                let last_key = last_key.as_mut().unwrap();
-                stat.on_unique_key_visited();
                let retention = self
                    .generate_key_retention(
                        *last_key,
@@ -1886,18 +1462,11 @@ impl Timeline {
                        gc_cutoff,
                        &retain_lsns_below_horizon,
                        COMPACTION_DELTA_THRESHOLD,
-                        get_ancestor_image(self, *last_key, ctx).await?,
                    )
                    .await?;
                // Put the image into the image layer. Currently we have a single big layer for the compaction.
                retention
-                    .pipe_to(
-                        *last_key,
-                        &mut delta_values,
-                        image_layer_writer.as_mut(),
-                        &mut stat,
-                        ctx,
-                    )
+                    .pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx)
                    .await?;
                delta_layers.extend(
                    flush_deltas(
@@ -1908,9 +1477,6 @@ impl Timeline {
                        self,
                        lowest_retain_lsn,
                        ctx,
-                        &mut stat,
-                        dry_run,
-                        false,
                    )
                    .await?,
                );
@@ -1922,7 +1488,6 @@ impl Timeline {

        let last_key = last_key.expect("no keys produced during compaction");
        // TODO: move this part to the loop body
-        stat.on_unique_key_visited();
        let retention = self
            .generate_key_retention(
                last_key,
@@ -1930,18 +1495,11 @@ impl Timeline {
                gc_cutoff,
                &retain_lsns_below_horizon,
                COMPACTION_DELTA_THRESHOLD,
-                get_ancestor_image(self, last_key, ctx).await?,
            )
            .await?;
        // Put the image into the image layer. Currently we have a single big layer for the compaction.
        retention
-            .pipe_to(
-                last_key,
-                &mut delta_values,
-                image_layer_writer.as_mut(),
-                &mut stat,
-                ctx,
-            )
+            .pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx)
            .await?;
        delta_layers.extend(
            flush_deltas(
@@ -1952,71 +1510,27 @@ impl Timeline {
                self,
                lowest_retain_lsn,
                ctx,
-                &mut stat,
-                dry_run,
-                true,
            )
            .await?,
        );
-        assert!(delta_values.is_empty(), "unprocessed keys");
-
-        let image_layer = if discard_image_layer {
-            stat.discard_image_layer();
-            None
-        } else if let Some(writer) = image_layer_writer {
-            stat.produce_image_layer(writer.size());
-            if !dry_run {
-                Some(writer.finish(self, ctx).await?)
-            } else {
-                None
-            }
-        } else {
-            None
-        };
-
-        info!(
-            "gc-compaction statistics: {}",
-            serde_json::to_string(&stat)?
-        );
-
-        if dry_run {
-            return Ok(());
-        }

+        let image_layer = image_layer_writer.finish(self, ctx).await?;
        info!(
            "produced {} delta layers and {} image layers",
            delta_layers.len(),
-            if image_layer.is_some() { 1 } else { 0 }
+            1
        );
        let mut compact_to = Vec::new();
-        let mut keep_layers = HashSet::new();
-        for action in delta_layers {
-            match action {
-                FlushDeltaResult::CreateResidentLayer(layer) => {
-                    compact_to.push(layer);
-                }
-                FlushDeltaResult::KeepLayer(l) => {
-                    keep_layers.insert(l);
-                }
-            }
-        }
-        if discard_image_layer {
-            keep_layers.insert(image_layer_key);
-        }
-        let mut layer_selection = layer_selection;
-        layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
-        compact_to.extend(image_layer);
-
+        compact_to.extend(delta_layers);
+        compact_to.push(image_layer);
        // Step 3: Place back to the layer map.
        {
            let mut guard = self.layers.write().await;
            guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
        };
+
        self.remote_client
            .schedule_compaction_update(&layer_selection, &compact_to)?;
-
-        drop(gc_lock);
-
        Ok(())
    }
 }
@@ -2213,9 +1727,9 @@ impl CompactionJobExecutor for TimelineAdaptor {
            ))
        });

-        let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?;
-        let new_delta_layer =
-            Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
+        let new_delta_layer = writer
+            .finish(prev.unwrap().0.next(), &self.timeline, ctx)
+            .await?;

        self.new_deltas.push(new_delta_layer);
        Ok(())
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -63,19 +63,10 @@ pub(super) async fn delete_local_timeline_directory(
    tenant_shard_id: TenantShardId,
    timeline: &Timeline,
 ) -> anyhow::Result<()> {
-    // Always ensure the lock order is compaction -> gc.
-    let compaction_lock = timeline.compaction_lock.lock();
-    let compaction_lock = crate::timed(
-        compaction_lock,
-        "acquires compaction lock",
-        std::time::Duration::from_secs(5),
-    )
-    .await;
-
-    let gc_lock = timeline.gc_lock.lock();
-    let gc_lock = crate::timed(
-        gc_lock,
-        "acquires gc lock",
+    let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
+    let guards = crate::timed(
+        guards,
+        "acquire gc and compaction locks",
        std::time::Duration::from_secs(5),
    )
    .await;
@@ -116,8 +107,7 @@ pub(super) async fn delete_local_timeline_directory(
        .context("fsync_pre_mark_remove")?;

    info!("finished deleting layer files, releasing locks");
-    drop(gc_lock);
-    drop(compaction_lock);
+    drop(guards);

    fail::fail_point!("timeline-delete-after-rm", |_| {
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
@@ -216,10 +206,11 @@ impl DeleteTimelineFlow {
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip_all)]
+    #[instrument(skip_all, fields(%inplace))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
+        inplace: bool,
    ) -> Result<(), DeleteTimelineError> {
        super::debug_assert_current_span_has_tenant_and_timeline_id();

@@ -230,8 +221,6 @@ impl DeleteTimelineFlow {
        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
        timeline.shutdown(super::ShutdownMode::Hard).await;

-        tenant.gc_block.before_delete(&timeline);
-
        fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
            Err(anyhow::anyhow!(
                "failpoint: timeline-delete-before-index-deleted-at"
@@ -246,7 +235,11 @@ impl DeleteTimelineFlow {
            ))?
        });

-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        if inplace {
+            Self::background(guard, tenant.conf, tenant, &timeline).await?
+        } else {
+            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        }

        Ok(())
    }
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -488,12 +488,10 @@ async fn copy_lsn_prefix(
        // reuse the key instead of adding more holes between layers by using the real
        // highest key in the layer.
        let reused_highest_key = layer.layer_desc().key_range.end;
-        let (desc, path) = writer
-            .finish(reused_highest_key, ctx)
+        let copied = writer
+            .finish(reused_highest_key, target_timeline, ctx)
            .await
            .map_err(CopyDeltaPrefix)?;
-        let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
-            .map_err(CopyDeltaPrefix)?;

        tracing::debug!(%layer, %copied, "new layer produced");

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -225,7 +225,7 @@ impl Timeline {
                    continue;
                }

-                let last_activity_ts = layer.latest_activity();
+                let last_activity_ts = layer.access_stats().latest_activity();

                let no_activity_for = match now.duration_since(last_activity_ts) {
                    Ok(d) => d,
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1,967 +0,0 @@
-//! An efficient way to keep the timeline gate open without preventing
-//! timeline shutdown for longer than a single call to a timeline method.
-//!
-//! # Motivation
-//!
-//! On a single page service connection, we're typically serving a single TenantTimelineId.
-//!
-//! Without sharding, there is a single Timeline object to which we dispatch
-//! all requests. For example, a getpage request gets dispatched to the
-//! Timeline::get method of the Timeline object that represents the
-//! (tenant,timeline) of that connection.
-//!
-//! With sharding, for each request that comes in on the connection,
-//! we first have to perform shard routing based on the requested key (=~ page number).
-//! The result of shard routing is a Timeline object.
-//! We then dispatch the request to that Timeline object.
-//!
-//! Regardless of whether the tenant is sharded or not, we want to ensure that
-//! we hold the Timeline gate open while we're invoking the method on the
-//! Timeline object.
-//!
-//! However, we want to avoid the overhead of entering the gate for every
-//! method invocation.
-//!
-//! Further, for shard routing, we want to avoid calling the tenant manager to
-//! resolve the shard for every request. Instead, we want to cache the
-//! routing result so we can bypass the tenant manager for all subsequent requests
-//! that get routed to that shard.
-//!
-//! Regardless of how we accomplish the above, it should not
-//! prevent the Timeline from shutting down promptly.
-//!
-//! # Design
-//!
-//! There are three user-facing data structures:
-//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
-//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
-//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
-//!   Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
-//!
-//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
-//!
-//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
-//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
-//!
-//! To dispatch a request, the page service connection calls `Cache::get`.
-//!
-//! A cache miss means we consult the tenant manager for shard routing,
-//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
-//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
-//! and the `Arc<HandleInner>` in the `PerTimelineState`.
-//!
-//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
-//! and find the `Weak<HandleInner>` in the cache.
-//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
-//!
-//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
-//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
-//!
-//! # Memory Management / How The Reference Cycle Is Broken
-//!
-//! The attentive reader may have noticed the strong reference cycle
-//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
-//!
-//! This cycle is intentional: while it exists, the `Cache` can upgrade its
-//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
-//!
-//! The cycle is broken by either
-//! - `PerTimelineState::shutdown` or
-//! - dropping the `Cache`.
-//!
-//! Concurrently existing `Handle`s will extend the existence of the cycle.
-//! However, since `Handle`s are short-lived and new `Handle`s are not
-//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
-//! that extension of the cycle is bounded.
-//!
-//! # Fast Path for Shard Routing
-//!
-//! The `Cache` has a fast path for shard routing to avoid calling into
-//! the tenant manager for every request.
-//!
-//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
-//!
-//! The current implementation uses the first entry in the hash map
-//! to determine the `ShardParameters` and derive the correct
-//! `ShardIndex` for the requested key.
-//!
-//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
-//!
-//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
-//! it's a hit.
-//!
-//! ## Cache invalidation
-//!
-//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
-//! The only reasons why an entry in the cache can become stale are:
-//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
-//!    being detached, timeline or shard deleted, or pageserver is shutting down.
-//! 2. We're doing a shard split and new traffic should be routed to the child shards.
-//!
-//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
-//! timeline has shut down, and when that happens, we remove the entry from the cache.
-//!
-//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
-//! to the parent shard during a shard split. Eventually, the shard split task will
-//! shut down the parent => case (1).
-
-use std::collections::hash_map;
-use std::collections::HashMap;
-use std::sync::atomic::AtomicBool;
-use std::sync::atomic::Ordering;
-use std::sync::Arc;
-use std::sync::Mutex;
-use std::sync::Weak;
-
-use pageserver_api::shard::ShardIdentity;
-use tracing::instrument;
-use tracing::trace;
-use utils::id::TimelineId;
-use utils::shard::ShardIndex;
-use utils::shard::ShardNumber;
-
-use crate::tenant::mgr::ShardSelector;
-
-/// The requirement for Debug is so that #[derive(Debug)] works in some places.
-pub(crate) trait Types: Sized + std::fmt::Debug {
-    type TenantManagerError: Sized + std::fmt::Debug;
-    type TenantManager: TenantManager<Self> + Sized;
-    type Timeline: ArcTimeline<Self> + Sized;
-}
-
-/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
-/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
-/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
-#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
-struct CacheId(u64);
-
-impl CacheId {
-    fn next() -> Self {
-        static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
-        let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        if id == 0 {
-            panic!("CacheId::new() returned 0, overflow");
-        }
-        Self(id)
-    }
-}
-
-/// See module-level comment.
-pub(crate) struct Cache<T: Types> {
-    id: CacheId,
-    map: Map<T>,
-}
-
-type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
-
-impl<T: Types> Default for Cache<T> {
-    fn default() -> Self {
-        Self {
-            id: CacheId::next(),
-            map: Default::default(),
-        }
-    }
-}
-
-#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
-pub(crate) struct ShardTimelineId {
-    pub(crate) shard_index: ShardIndex,
-    pub(crate) timeline_id: TimelineId,
-}
-
-/// See module-level comment.
-pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
-struct HandleInner<T: Types> {
-    shut_down: AtomicBool,
-    timeline: T::Timeline,
-    // The timeline's gate held open.
-    _gate_guard: utils::sync::gate::GateGuard,
-}
-
-/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
-///
-/// See module-level comment for details.
-pub struct PerTimelineState<T: Types> {
-    // None = shutting down
-    handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
-}
-
-impl<T: Types> Default for PerTimelineState<T> {
-    fn default() -> Self {
-        Self {
-            handles: Mutex::new(Some(Default::default())),
-        }
-    }
-}
-
-/// Abstract view of [`crate::tenant::mgr`], for testability.
-pub(crate) trait TenantManager<T: Types> {
-    /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
-    /// Errors are returned as [`GetError::TenantManager`].
-    async fn resolve(
-        &self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-    ) -> Result<T::Timeline, T::TenantManagerError>;
-}
-
-/// Abstract view of an [`Arc<Timeline>`], for testability.
-pub(crate) trait ArcTimeline<T: Types>: Clone {
-    fn gate(&self) -> &utils::sync::gate::Gate;
-    fn shard_timeline_id(&self) -> ShardTimelineId;
-    fn get_shard_identity(&self) -> &ShardIdentity;
-    fn per_timeline_state(&self) -> &PerTimelineState<T>;
-}
-
-/// Errors returned by [`Cache::get`].
-#[derive(Debug)]
-pub(crate) enum GetError<T: Types> {
-    TenantManager(T::TenantManagerError),
-    TimelineGateClosed,
-    PerTimelineStateShutDown,
-}
-
-/// Internal type used in [`Cache::get`].
-enum RoutingResult<T: Types> {
-    FastPath(Handle<T>),
-    SlowPath(ShardTimelineId),
-    NeedConsultTenantManager,
-}
-
-impl<T: Types> Cache<T> {
-    /// See module-level comment for details.
-    ///
-    /// Does NOT check for the shutdown state of [`Types::Timeline`].
-    /// Instead, the methods of [`Types::Timeline`] that are invoked through
-    /// the [`Handle`] are responsible for checking these conditions
-    /// and if so, return an error that causes the page service to
-    /// close the connection.
-    #[instrument(level = "trace", skip_all)]
-    pub(crate) async fn get(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        // terminates because each iteration removes an element from the map
-        loop {
-            let handle = self
-                .get_impl(timeline_id, shard_selector, tenant_manager)
-                .await?;
-            if handle.0.shut_down.load(Ordering::Relaxed) {
-                let removed = self
-                    .map
-                    .remove(&handle.0.timeline.shard_timeline_id())
-                    .expect("invariant of get_impl is that the returned handle is in the map");
-                assert!(
-                    Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
-                    "shard_timeline_id() incorrect?"
-                );
-            } else {
-                return Ok(handle);
-            }
-        }
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    async fn get_impl(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        let miss: ShardSelector = {
-            let routing_state = self.shard_routing(timeline_id, shard_selector);
-            match routing_state {
-                RoutingResult::FastPath(handle) => return Ok(handle),
-                RoutingResult::SlowPath(key) => match self.map.get(&key) {
-                    Some(cached) => match cached.upgrade() {
-                        Some(upgraded) => return Ok(Handle(upgraded)),
-                        None => {
-                            trace!("handle cache stale");
-                            self.map.remove(&key).unwrap();
-                            ShardSelector::Known(key.shard_index)
-                        }
-                    },
-                    None => ShardSelector::Known(key.shard_index),
-                },
-                RoutingResult::NeedConsultTenantManager => shard_selector,
-            }
-        };
-        self.get_miss(timeline_id, miss, tenant_manager).await
-    }
-
-    #[inline(always)]
-    fn shard_routing(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-    ) -> RoutingResult<T> {
-        loop {
-            // terminates because when every iteration we remove an element from the map
-            let Some((first_key, first_handle)) = self.map.iter().next() else {
-                return RoutingResult::NeedConsultTenantManager;
-            };
-            let Some(first_handle) = first_handle.upgrade() else {
-                // TODO: dedup with get()
-                trace!("handle cache stale");
-                let first_key_owned = *first_key;
-                self.map.remove(&first_key_owned).unwrap();
-                continue;
-            };
-
-            let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
-            let make_shard_index = |shard_num: ShardNumber| ShardIndex {
-                shard_number: shard_num,
-                shard_count: first_handle_shard_identity.count,
-            };
-
-            let need_idx = match shard_selector {
-                ShardSelector::Page(key) => {
-                    make_shard_index(first_handle_shard_identity.get_shard_number(&key))
-                }
-                ShardSelector::Zero => make_shard_index(ShardNumber(0)),
-                ShardSelector::Known(shard_idx) => shard_idx,
-            };
-            let need_shard_timeline_id = ShardTimelineId {
-                shard_index: need_idx,
-                timeline_id,
-            };
-            let first_handle_shard_timeline_id = ShardTimelineId {
-                shard_index: first_handle_shard_identity.shard_index(),
-                timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
-            };
-
-            if need_shard_timeline_id == first_handle_shard_timeline_id {
-                return RoutingResult::FastPath(Handle(first_handle));
-            } else {
-                return RoutingResult::SlowPath(need_shard_timeline_id);
-            }
-        }
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    #[inline(always)]
-    async fn get_miss(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        match tenant_manager.resolve(timeline_id, shard_selector).await {
-            Ok(timeline) => {
-                let key = timeline.shard_timeline_id();
-                match &shard_selector {
-                    ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
-                    ShardSelector::Page(_) => (), // gotta trust tenant_manager
-                    ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
-                }
-
-                let gate_guard = match timeline.gate().enter() {
-                    Ok(guard) => guard,
-                    Err(_) => {
-                        return Err(GetError::TimelineGateClosed);
-                    }
-                };
-                trace!("creating new HandleInner");
-                let handle = Arc::new(
-                    // TODO: global metric that keeps track of the number of live HandlerTimeline instances
-                    // so we can identify reference cycle bugs.
-                    HandleInner {
-                        shut_down: AtomicBool::new(false),
-                        _gate_guard: gate_guard,
-                        timeline: timeline.clone(),
-                    },
-                );
-                let handle = {
-                    let mut lock_guard = timeline
-                        .per_timeline_state()
-                        .handles
-                        .lock()
-                        .expect("mutex poisoned");
-                    match &mut *lock_guard {
-                        Some(per_timeline_state) => {
-                            let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
-                            assert!(replaced.is_none(), "some earlier code left a stale handle");
-                            match self.map.entry(key) {
-                                hash_map::Entry::Occupied(_o) => {
-                                    // This cannot not happen because
-                                    // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
-                                    // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
-                                    //    while we were waiting for the tenant manager.
-                                    unreachable!()
-                                }
-                                hash_map::Entry::Vacant(v) => {
-                                    v.insert(Arc::downgrade(&handle));
-                                    handle
-                                }
-                            }
-                        }
-                        None => {
-                            return Err(GetError::PerTimelineStateShutDown);
-                        }
-                    }
-                };
-                Ok(Handle(handle))
-            }
-            Err(e) => Err(GetError::TenantManager(e)),
-        }
-    }
-}
-
-impl<T: Types> PerTimelineState<T> {
-    /// After this method returns, [`Cache::get`] will never again return a [`Handle`]
-    /// to the [`Types::Timeline`] that embeds this per-timeline state.
-    /// Even if [`TenantManager::resolve`] would still resolve to it.
-    ///
-    /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
-    /// That's ok because they're short-lived. See module-level comment for details.
-    #[instrument(level = "trace", skip_all)]
-    pub(super) fn shutdown(&self) {
-        let handles = self
-            .handles
-            .lock()
-            .expect("mutex poisoned")
-            // NB: this .take() sets locked to None.
-            // That's what makes future `Cache::get` misses fail.
-            // Cache hits are taken care of below.
-            .take();
-        let Some(handles) = handles else {
-            trace!("already shut down");
-            return;
-        };
-        for handle in handles.values() {
-            // Make hits fail.
-            handle.shut_down.store(true, Ordering::Relaxed);
-        }
-        drop(handles);
-    }
-}
-
-impl<T: Types> std::ops::Deref for Handle<T> {
-    type Target = T::Timeline;
-    fn deref(&self) -> &Self::Target {
-        &self.0.timeline
-    }
-}
-
-#[cfg(test)]
-impl<T: Types> Drop for HandleInner<T> {
-    fn drop(&mut self) {
-        trace!("HandleInner dropped");
-    }
-}
-
-// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
-impl<T: Types> Drop for Cache<T> {
-    fn drop(&mut self) {
-        for (_, weak) in self.map.drain() {
-            if let Some(strong) = weak.upgrade() {
-                // handle is still being kept alive in PerTimelineState
-                let timeline = strong.timeline.per_timeline_state();
-                let mut handles = timeline.handles.lock().expect("mutex poisoned");
-                if let Some(handles) = &mut *handles {
-                    let Some(removed) = handles.remove(&self.id) else {
-                        // There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
-                        continue;
-                    };
-                    assert!(Arc::ptr_eq(&removed, &strong));
-                }
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use pageserver_api::{
-        key::{rel_block_to_key, Key, DBDIR_KEY},
-        models::ShardParameters,
-        reltag::RelTag,
-        shard::ShardStripeSize,
-    };
-    use utils::shard::ShardCount;
-
-    use super::*;
-
-    const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
-
-    #[derive(Debug)]
-    struct TestTypes;
-    impl Types for TestTypes {
-        type TenantManagerError = anyhow::Error;
-        type TenantManager = StubManager;
-        type Timeline = Arc<StubTimeline>;
-    }
-
-    struct StubManager {
-        shards: Vec<Arc<StubTimeline>>,
-    }
-
-    struct StubTimeline {
-        gate: utils::sync::gate::Gate,
-        id: TimelineId,
-        shard: ShardIdentity,
-        per_timeline_state: PerTimelineState<TestTypes>,
-        myself: Weak<StubTimeline>,
-    }
-
-    impl StubTimeline {
-        fn getpage(&self) {
-            // do nothing
-        }
-    }
-
-    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
-        fn gate(&self) -> &utils::sync::gate::Gate {
-            &self.gate
-        }
-
-        fn shard_timeline_id(&self) -> ShardTimelineId {
-            ShardTimelineId {
-                shard_index: self.shard.shard_index(),
-                timeline_id: self.id,
-            }
-        }
-
-        fn get_shard_identity(&self) -> &ShardIdentity {
-            &self.shard
-        }
-
-        fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
-            &self.per_timeline_state
-        }
-    }
-
-    impl TenantManager<TestTypes> for StubManager {
-        async fn resolve(
-            &self,
-            timeline_id: TimelineId,
-            shard_selector: ShardSelector,
-        ) -> anyhow::Result<Arc<StubTimeline>> {
-            for timeline in &self.shards {
-                if timeline.id == timeline_id {
-                    match &shard_selector {
-                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Zero => continue,
-                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Page(_) => continue,
-                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Known(_) => continue,
-                    }
-                }
-            }
-            anyhow::bail!("not found")
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_timeline_shutdown() {
-        crate::tenant::harness::setup_logging();
-
-        let timeline_id = TimelineId::generate();
-        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mgr = StubManager {
-            shards: vec![shard0.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        //
-        // fill the cache
-        //
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (2, 1),
-            "strong: shard0, mgr; weak: myself"
-        );
-
-        let handle: Handle<_> = cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have the timeline");
-        let handle_inner_weak = Arc::downgrade(&handle.0);
-        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-        assert_eq!(
-            (
-                Weak::strong_count(&handle_inner_weak),
-                Weak::weak_count(&handle_inner_weak)
-            ),
-            (2, 2),
-            "strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
-        );
-        assert_eq!(cache.map.len(), 1);
-
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
-        );
-        drop(handle);
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
-        );
-
-        //
-        // demonstrate that Handle holds up gate closure
-        // but shutdown prevents new handles from being handed out
-        //
-
-        tokio::select! {
-            _ = shard0.gate.close() => {
-                panic!("cache and per-timeline handler state keep cache open");
-            }
-            _ = tokio::time::sleep(FOREVER) => {
-                // NB: first poll of close() makes it enter closing state
-            }
-        }
-
-        let handle = cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have the timeline");
-        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-
-        // SHUTDOWN
-        shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
-
-        assert_eq!(
-            1,
-            Weak::strong_count(&handle_inner_weak),
-            "through local var handle"
-        );
-        assert_eq!(
-            cache.map.len(),
-            1,
-            "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
-        );
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(via handle), shard0, mgr; weak: myself"
-        );
-
-        // this handle is perfectly usable
-        handle.getpage();
-
-        cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
-        assert_eq!(
-            cache.map.len(),
-            0,
-            "first access after shutdown cleans up the Weak's from the cache"
-        );
-
-        tokio::select! {
-            _ = shard0.gate.close() => {
-                panic!("handle is keeping gate open");
-            }
-            _ = tokio::time::sleep(FOREVER) => { }
-        }
-
-        drop(handle);
-        assert_eq!(
-            0,
-            Weak::strong_count(&handle_inner_weak),
-            "the HandleInner destructor already ran"
-        );
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (2, 1),
-            "strong: shard0, mgr; weak: myself"
-        );
-
-        // closing gate succeeds after dropping handle
-        tokio::select! {
-            _ = shard0.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("handle is dropped, no other gate holders exist")
-            }
-        }
-
-        // map gets cleaned on next lookup
-        cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown");
-        assert_eq!(cache.map.len(), 0);
-
-        // ensure all refs to shard0 are gone and we're not leaking anything
-        let myself = Weak::clone(&shard0.myself);
-        drop(shard0);
-        drop(mgr);
-        assert_eq!(Weak::strong_count(&myself), 0);
-    }
-
-    #[tokio::test]
-    async fn test_multiple_timelines_and_deletion() {
-        crate::tenant::harness::setup_logging();
-
-        let timeline_a = TimelineId::generate();
-        let timeline_b = TimelineId::generate();
-        assert_ne!(timeline_a, timeline_b);
-        let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_a,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_b,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mut mgr = StubManager {
-            shards: vec![timeline_a.clone(), timeline_b.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        cache
-            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have it");
-        cache
-            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have it");
-        assert_eq!(cache.map.len(), 2);
-
-        // delete timeline A
-        timeline_a.per_timeline_state.shutdown();
-        mgr.shards.retain(|t| t.id != timeline_a.id);
-        assert!(
-            mgr.resolve(timeline_a.id, ShardSelector::Page(key))
-                .await
-                .is_err(),
-            "broken StubManager implementation"
-        );
-
-        assert_eq!(
-            cache.map.len(),
-            2,
-            "cache still has a Weak handle to Timeline A"
-        );
-        cache
-            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown");
-        assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
-
-        cache
-            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we still have it");
-    }
-
-    fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
-        rel_block_to_key(
-            RelTag {
-                spcnode: 1663,
-                dbnode: 208101,
-                relnode: 2620,
-                forknum: 0,
-            },
-            shard.0 as u32 * params.stripe_size.0,
-        )
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_shard_split() {
-        crate::tenant::harness::setup_logging();
-        let timeline_id = TimelineId::generate();
-        let parent = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child_params = ShardParameters {
-            count: ShardCount(2),
-            stripe_size: ShardStripeSize::default(),
-        };
-        let child0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child1 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child_shards_by_shard_number = [child0.clone(), child1.clone()];
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        // fill the cache with the parent
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![parent.clone()],
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(&handle.myself, &parent.myself),
-                "mgr returns parent first"
-            );
-            drop(handle);
-        }
-
-        //
-        // SHARD SPLIT: tenant manager changes, but the cache isn't informed
-        //
-
-        // while we haven't shut down the parent, the cache will return the cached parent, even
-        // if the tenant manager returns the child
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(&handle.myself, &parent.myself),
-                "mgr returns parent"
-            );
-            drop(handle);
-        }
-
-        let parent_handle = cache
-            .get(
-                timeline_id,
-                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
-                &StubManager {
-                    shards: vec![parent.clone()],
-                },
-            )
-            .await
-            .expect("we have it");
-        assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
-
-        // invalidate the cache
-        parent.per_timeline_state.shutdown();
-
-        // the cache will now return the child, even though the parent handle still exists
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(
-                    &handle.myself,
-                    &child_shards_by_shard_number[i as usize].myself
-                ),
-                "mgr returns child"
-            );
-            drop(handle);
-        }
-
-        // all the while the parent handle kept the parent gate open
-        tokio::select! {
-            _ = parent_handle.gate.close() => {
-                panic!("parent handle is keeping gate open");
-            }
-            _ = tokio::time::sleep(FOREVER) => { }
-        }
-        drop(parent_handle);
-        tokio::select! {
-            _ = parent.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("parent handle is dropped, no other gate holders exist")
-            }
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_connection_handler_exit() {
-        crate::tenant::harness::setup_logging();
-        let timeline_id = TimelineId::generate();
-        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mgr = StubManager {
-            shards: vec![shard0.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        // Simulate 10 connections that's opened, used, and closed
-        let mut used_handles = vec![];
-        for _ in 0..10 {
-            let mut cache = Cache::<TestTypes>::default();
-            let handle = {
-                let handle = cache
-                    .get(timeline_id, ShardSelector::Page(key), &mgr)
-                    .await
-                    .expect("we have the timeline");
-                assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-                handle
-            };
-            handle.getpage();
-            used_handles.push(Arc::downgrade(&handle.0));
-        }
-
-        // No handles exist, thus gates are closed and don't require shutdown
-        assert!(used_handles
-            .iter()
-            .all(|weak| Weak::strong_count(weak) == 0));
-
-        // ... thus the gate should close immediately, even without shutdown
-        tokio::select! {
-            _ = shard0.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("handle is dropped, no other gate holders exist")
-            }
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -35,10 +35,6 @@ impl LayerManager {
        self.layer_fmgr.get_from_desc(desc)
    }

-    pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
-        self.layer_fmgr.get_from_key(desc)
-    }
-
    /// Get an immutable reference to the layer map.
    ///
    /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
@@ -259,10 +255,13 @@ impl LayerManager {
                new_layer.layer_desc().lsn_range
            );

-            // Transfer visibility hint from old to new layer, since the new layer covers the same key space.  This is not guaranteed to
+            // Transfer visibility hint from old to new layer, since the new layer covers the same key space.  This is not guaranteed to
            // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
            // always marking rewritten layers as visible.
-            new_layer.as_ref().set_visibility(old_layer.visibility());
+            new_layer
+                .as_ref()
+                .access_stats()
+                .set_visibility(old_layer.access_stats().visibility());

            // Safety: we may never rewrite the same file in-place.  Callers are responsible
            // for ensuring that they only rewrite layers after something changes the path,
@@ -366,20 +365,16 @@ impl<T> Default for LayerFileManager<T> {
 }

 impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
-    fn get_from_key(&self, key: &PersistentLayerKey) -> T {
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
        self.0
-            .get(key)
-            .with_context(|| format!("get layer from key: {}", key))
+            .get(&desc.key())
+            .with_context(|| format!("get layer from desc: {}", desc.layer_name()))
            .expect("not found")
            .clone()
    }

-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
-        self.get_from_key(&desc.key())
-    }
-
    fn contains_key(&self, key: &PersistentLayerKey) -> bool {
        self.0.contains_key(key)
    }
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -122,10 +122,6 @@ impl CurrentLogicalSize {
            Self::Exact(_) => Accuracy::Exact,
        }
    }
-
-    pub(crate) fn is_exact(&self) -> bool {
-        matches!(self, Self::Exact(_))
-    }
 }

 impl LogicalSize {
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -241,9 +241,6 @@ impl PostgresRedoManager {

    /// Shut down the WAL redo manager.
    ///
-    /// Returns `true` if this call was the one that initiated shutdown.
-    /// `true` may be observed by no caller if the first caller stops polling.
-    ///
    /// After this future completes
    /// - no redo process is running
    /// - no new redo process will be spawned
@@ -253,32 +250,22 @@ impl PostgresRedoManager {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub async fn shutdown(&self) -> bool {
+    pub async fn shutdown(&self) {
        // prevent new processes from being spawned
-        let maybe_permit = match self.redo_process.get_or_init_detached().await {
+        let permit = match self.redo_process.get_or_init_detached().await {
            Ok(guard) => {
-                if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
-                    None
-                } else {
-                    let (proc, permit) = guard.take_and_deinit();
-                    drop(proc); // this just drops the Arc, its refcount may not be zero yet
-                    Some(permit)
-                }
+                let (proc, permit) = guard.take_and_deinit();
+                drop(proc); // this just drops the Arc, its refcount may not be zero yet
+                permit
            }
-            Err(permit) => Some(permit),
-        };
-        let it_was_us = if let Some(permit) = maybe_permit {
-            self.redo_process
-                .set(ProcessOnceCell::ManagerShutDown, permit);
-            true
-        } else {
-            false
+            Err(permit) => permit,
        };
+        self.redo_process
+            .set(ProcessOnceCell::ManagerShutDown, permit);
        // wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
        // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
        // for the underlying process.
        self.launched_processes.close().await;
-        it_was_us
    }

    /// This type doesn't have its own background task to check for idleness: we
--- a/pageserver/test_data/indices/mixed_workload/README.md
+++ b/pageserver/test_data/indices/mixed_workload/README.md
@@ -1,7 +0,0 @@
-
-# This was captured from one shard of a large tenant in staging.
-
-# It has a mixture of deltas and image layers, >1000 layers in total.
-
-# This is suitable for general smoke tests that want an index which is not
-# trivially small, but doesn't contain weird/pathological cases.
--- a/pageserver/test_data/indices/mixed_workload/index_part.json
+++ b/pageserver/test_data/indices/mixed_workload/index_part.json
--- a/poetry.lock
+++ b/poetry.lock
@@ -870,96 +870,6 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}

-[[package]]
-name = "clickhouse-connect"
-version = "0.7.17"
-description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
-optional = false
-python-versions = "~=3.8"
-files = [
-    {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
-]
-
-[package.dependencies]
-certifi = "*"
-lz4 = "*"
-pytz = "*"
-urllib3 = ">=1.26"
-zstandard = "*"
-
-[package.extras]
-arrow = ["pyarrow"]
-numpy = ["numpy"]
-orjson = ["orjson"]
-pandas = ["pandas"]
-sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
-tzlocal = ["tzlocal (>=4.0)"]
-
 [[package]]
 name = "colorama"
 version = "0.4.5"
@@ -1514,20 +1424,6 @@ files = [
 [package.dependencies]
 six = "*"

-[[package]]
-name = "kafka-python"
-version = "2.0.2"
-description = "Pure Python client for Apache Kafka"
-optional = false
-python-versions = "*"
-files = [
-    {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
-    {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
-]
-
-[package.extras]
-crc32c = ["crc32c"]
-
 [[package]]
 name = "lazy-object-proxy"
 version = "1.10.0"
@@ -1574,56 +1470,6 @@ files = [
    {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
 ]

-[[package]]
-name = "lz4"
-version = "4.3.3"
-description = "LZ4 Bindings for Python"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
-    {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
-    {file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
-    {file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
-    {file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
-    {file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
-    {file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
-    {file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
-    {file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
-    {file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
-    {file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
-    {file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
-    {file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
-    {file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
-    {file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
-    {file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
-    {file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
-    {file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
-    {file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
-    {file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
-    {file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
-]
-
-[package.extras]
-docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
-flake8 = ["flake8"]
-tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
-
 [[package]]
 name = "markupsafe"
 version = "2.1.1"
@@ -2515,17 +2361,6 @@ files = [
 [package.dependencies]
 six = ">=1.5"

-[[package]]
-name = "pytz"
-version = "2024.1"
-description = "World timezone definitions, modern and historical"
-optional = false
-python-versions = "*"
-files = [
-    {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
-    {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
-]
-
 [[package]]
 name = "pywin32"
 version = "301"
@@ -3371,4 +3206,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72"
+content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -5,6 +5,4 @@ pub use limit_algorithm::{
 };
 pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
 mod leaky_bucket;
-pub use leaky_bucket::{
-    EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState,
-};
+pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter};
--- a/proxy/src/rate_limiter/leaky_bucket.rs
+++ b/proxy/src/rate_limiter/leaky_bucket.rs
@@ -1,6 +1,7 @@
 use std::{
    hash::Hash,
    sync::atomic::{AtomicUsize, Ordering},
+    time::Duration,
 };

 use ahash::RandomState;
@@ -16,7 +17,7 @@ pub type EndpointRateLimiter = LeakyBucketRateLimiter<EndpointIdInt>;

 pub struct LeakyBucketRateLimiter<Key> {
    map: DashMap<Key, LeakyBucketState, RandomState>,
-    config: LeakyBucketConfig,
+    config: LeakyBucketConfigInner,
    access_count: AtomicUsize,
 }

@@ -29,7 +30,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
    pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self {
        Self {
            map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
-            config,
+            config: config.into(),
            access_count: AtomicUsize::new(0),
        }
    }
@@ -42,10 +43,10 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
            self.do_gc(now);
        }

-        let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState {
-            time: now,
-            filled: 0.0,
-        });
+        let mut entry = self
+            .map
+            .entry(key)
+            .or_insert_with(|| LeakyBucketState::new(now));

        entry.check(&self.config, now, n as f64)
    }
@@ -59,7 +60,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
        let shard = thread_rng().gen_range(0..n);
        self.map.shards()[shard]
            .write()
-            .retain(|_, value| !value.get_mut().update(&self.config, now));
+            .retain(|_, value| value.get().should_retain(now));
    }
 }

@@ -68,11 +69,6 @@ pub struct LeakyBucketConfig {
    pub max: f64,
 }

-pub struct LeakyBucketState {
-    filled: f64,
-    time: Instant,
-}
-
 impl LeakyBucketConfig {
    pub fn new(rps: f64, max: f64) -> Self {
        assert!(rps > 0.0, "rps must be positive");
@@ -81,40 +77,76 @@ impl LeakyBucketConfig {
    }
 }

-impl LeakyBucketState {
-    pub fn new() -> Self {
+struct LeakyBucketConfigInner {
+    /// "time cost" of a single request unit.
+    /// loosely represents how long it takes to handle a request unit in active CPU time.
+    time_cost: Duration,
+    bucket_width: Duration,
+}
+
+impl From<LeakyBucketConfig> for LeakyBucketConfigInner {
+    fn from(config: LeakyBucketConfig) -> Self {
+        // seconds_per_request = 1 / requests_per_second
+        let spr = config.rps.recip();
        Self {
-            filled: 0.0,
-            time: Instant::now(),
+            time_cost: Duration::from_secs_f64(spr),
+            bucket_width: Duration::from_secs_f64(config.max * spr),
        }
    }
-
-    /// updates the timer and returns true if the bucket is empty
-    fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool {
-        let drain = now.duration_since(self.time);
-        let drain = drain.as_secs_f64() * info.rps;
-
-        self.filled = (self.filled - drain).clamp(0.0, info.max);
-        self.time = now;
-
-        self.filled == 0.0
-    }
-
-    pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
-        self.update(info, now);
-
-        if self.filled + n > info.max {
-            return false;
-        }
-        self.filled += n;
-
-        true
-    }
 }

-impl Default for LeakyBucketState {
-    fn default() -> Self {
-        Self::new()
+struct LeakyBucketState {
+    /// Bucket is represented by `start..end` where `start = end - config.bucket_width`.
+    ///
+    /// At any given time, `end - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
+    /// Adding `n` tokens to the bucket is done by moving `end` forward by `n * config.time_cost`.
+    /// If `now < start`, the bucket is considered filled and cannot accept any more tokens.
+    /// Draining the bucket will happen naturally as `now` moves forward.
+    ///
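+    /// Let `n` be the "time cost" of the request (the number of tokens multiplied by `config.time_cost`).
+    /// If `now` is at or after `end + n`, the bucket has fully drained and `end` is reset to `now + n`.
+    /// If `now` is within the `bucket window + n`, we are within time budget and `end` moves forward by `n`.
+    /// If `now` is before the `bucket window + n`, we have run out of budget and the request is rejected.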
+    ///
+    /// This is inspired by the generic cell rate algorithm (GCRA) and works
+    /// exactly the same as a leaky-bucket.
+    end: Instant,
+}
+
+impl LeakyBucketState {
+    fn new(now: Instant) -> Self {
+        Self { end: now }
+    }
+
+    fn should_retain(&self, now: Instant) -> bool {
+        // if self.end is after now, the bucket is not empty
+        now < self.end
+    }
+
+    fn check(&mut self, config: &LeakyBucketConfigInner, now: Instant, n: f64) -> bool {
+        let start = self.end - config.bucket_width;
+
+        let n = config.time_cost.mul_f64(n);
+
+        //       start          end
+        //       |     start+n  |     end+n
+        //       |   /          |   /
+        // ------{o-[---------o-}--]----o----
+        //   now1 ^      now2 ^         ^ now3
+        //
+        // at now1, the bucket would be completely filled if we add n tokens.
+        // at now2, the bucket would be partially filled if we add n tokens.
+        // at now3, the bucket would start completely empty before we add n tokens.
+
+        if self.end + n <= now {
+            self.end = now + n;
+            true
+        } else if start + n <= now {
+            self.end += n;
+            true
+        } else {
+            false
+        }
    }
 }

@@ -124,47 +156,50 @@ mod tests {

    use tokio::time::Instant;

-    use super::{LeakyBucketConfig, LeakyBucketState};
+    use super::{LeakyBucketConfig, LeakyBucketConfigInner, LeakyBucketState};

    #[tokio::test(start_paused = true)]
    async fn check() {
-        let info = LeakyBucketConfig::new(500.0, 2000.0);
-        let mut bucket = LeakyBucketState::new();
+        let config: LeakyBucketConfigInner = LeakyBucketConfig::new(500.0, 2000.0).into();
+        assert_eq!(config.time_cost, Duration::from_millis(2));
+        assert_eq!(config.bucket_width, Duration::from_secs(4));
+
+        let mut bucket = LeakyBucketState::new(Instant::now());

        // should work for 2000 requests this second
        for _ in 0..2000 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
+            assert!(bucket.check(&config, Instant::now(), 1.0));
        }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
-        assert_eq!(bucket.filled, 2000.0);
+        assert!(!bucket.check(&config, Instant::now(), 1.0));
+        assert_eq!(bucket.end - Instant::now(), config.bucket_width);

        // in 1ms we should drain 0.5 tokens.
        // make sure we don't lose any tokens
        tokio::time::advance(Duration::from_millis(1)).await;
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
+        assert!(!bucket.check(&config, Instant::now(), 1.0));
        tokio::time::advance(Duration::from_millis(1)).await;
-        assert!(bucket.check(&info, Instant::now(), 1.0));
+        assert!(bucket.check(&config, Instant::now(), 1.0));

        // in 10ms we should drain 5 tokens
        tokio::time::advance(Duration::from_millis(10)).await;
        for _ in 0..5 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
+            assert!(bucket.check(&config, Instant::now(), 1.0));
        }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
+        assert!(!bucket.check(&config, Instant::now(), 1.0));

        // in 10s we should drain 5000 tokens
        // but cap is only 2000
        tokio::time::advance(Duration::from_secs(10)).await;
        for _ in 0..2000 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
+            assert!(bucket.check(&config, Instant::now(), 1.0));
        }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
+        assert!(!bucket.check(&config, Instant::now(), 1.0));

        // should sustain 500rps
        for _ in 0..2000 {
            tokio::time::advance(Duration::from_millis(10)).await;
            for _ in 0..5 {
-                assert!(bucket.check(&info, Instant::now(), 1.0));
+                assert!(bucket.check(&config, Instant::now(), 1.0));
            }
        }
    }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,8 @@
 [tool.poetry]
+name = "neon"
+version = "0.1.0"
 description = ""
 authors = []
-package-mode = false

 [tool.poetry.dependencies]
 python = "^3.9"
@@ -40,8 +41,6 @@ zstandard = "^0.21.0"
 httpx = {extras = ["http2"], version = "^0.26.0"}
 pytest-repeat = "^0.9.3"
 websockets = "^12.0"
-clickhouse-connect = "^0.7.16"
-kafka-python = "^2.0.2"

 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
@@ -75,7 +74,6 @@ module = [
    "allure.*",
    "allure_commons.*",
    "allure_pytest.*",
-    "kafka.*",
 ]
 ignore_missing_imports = true

--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -170,6 +170,11 @@ struct Args {
    /// still needed for existing replication connection.
    #[arg(long)]
    walsenders_keep_horizon: bool,
+    /// Enable partial backup. If disabled, safekeeper will not upload partial
+    /// segments to remote storage.
+    /// TODO: now partial backup is always enabled, remove this flag.
+    #[arg(long)]
+    partial_backup_enabled: bool,
    /// Controls how long backup will wait until uploading the partial segment.
    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
    partial_backup_timeout: Duration,
@@ -342,6 +347,7 @@ async fn main() -> anyhow::Result<()> {
        sk_auth_token,
        current_thread_runtime: args.current_thread_runtime,
        walsenders_keep_horizon: args.walsenders_keep_horizon,
+        partial_backup_enabled: true,
        partial_backup_timeout: args.partial_backup_timeout,
        disable_periodic_broker_push: args.disable_periodic_broker_push,
        enable_offload: args.enable_offload,
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -21,7 +21,6 @@ pub mod json_ctrl;
 pub mod metrics;
 pub mod patch_control_file;
 pub mod pull_timeline;
-pub mod rate_limit;
 pub mod receive_wal;
 pub mod recovery;
 pub mod remove_wal;
@@ -54,7 +53,6 @@ pub mod defaults {
    pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
    pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
    pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
-    pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2;

    // By default, our required residency before eviction is the same as the period that passes
    // before uploading a partial segment, so that in normal operation the eviction can happen
@@ -93,6 +91,7 @@ pub struct SafeKeeperConf {
    pub sk_auth_token: Option<SecretString>,
    pub current_thread_runtime: bool,
    pub walsenders_keep_horizon: bool,
+    pub partial_backup_enabled: bool,
    pub partial_backup_timeout: Duration,
    pub disable_periodic_broker_push: bool,
    pub enable_offload: bool,
@@ -136,6 +135,7 @@ impl SafeKeeperConf {
            max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
            current_thread_runtime: false,
            walsenders_keep_horizon: false,
+            partial_backup_enabled: false,
            partial_backup_timeout: Duration::from_secs(0),
            disable_periodic_broker_push: false,
            enable_offload: false,
--- a/safekeeper/src/rate_limit.rs
+++ b/safekeeper/src/rate_limit.rs
@@ -1,49 +0,0 @@
-use std::sync::Arc;
-
-use rand::Rng;
-
-use crate::metrics::MISC_OPERATION_SECONDS;
-
-/// Global rate limiter for background tasks.
-#[derive(Clone)]
-pub struct RateLimiter {
-    partial_backup: Arc<tokio::sync::Semaphore>,
-    eviction: Arc<tokio::sync::Semaphore>,
-}
-
-impl RateLimiter {
-    /// Create a new rate limiter.
-    /// - `partial_backup_max`: maximum number of concurrent partial backups.
-    /// - `eviction_max`: maximum number of concurrent timeline evictions.
-    pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self {
-        Self {
-            partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)),
-            eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)),
-        }
-    }
-
-    /// Get a permit for partial backup. This will block if the maximum number of concurrent
-    /// partial backups is reached.
-    pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit {
-        let _timer = MISC_OPERATION_SECONDS
-            .with_label_values(&["partial_permit_acquire"])
-            .start_timer();
-        self.partial_backup
-            .clone()
-            .acquire_owned()
-            .await
-            .expect("semaphore is closed")
-    }
-
-    /// Try to get a permit for timeline eviction. This will return None if the maximum number of
-    /// concurrent timeline evictions is reached.
-    pub fn try_acquire_eviction(&self) -> Option<tokio::sync::OwnedSemaphorePermit> {
-        self.eviction.clone().try_acquire_owned().ok()
-    }
-}
-
-/// Generate a random duration that is a fraction of the given duration.
-pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration {
-    let randf64 = rand::thread_rng().gen_range(0.0..1.0);
-    duration.mul_f64(randf64)
-}
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -25,7 +25,6 @@ use utils::{
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;

-use crate::rate_limit::RateLimiter;
 use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{
    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
@@ -37,7 +36,7 @@ use crate::timeline_guard::ResidenceGuard;
 use crate::timeline_manager::{AtomicStatus, ManagerCtl};
 use crate::timelines_set::TimelinesSet;
 use crate::wal_backup::{self};
-use crate::wal_backup_partial::PartialRemoteSegment;
+use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter};
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};

 use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -5,6 +5,7 @@
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use remote_storage::RemotePath;
+use std::time::Instant;
 use tokio::{
    fs::File,
    io::{AsyncRead, AsyncWriteExt},
@@ -14,7 +15,6 @@ use utils::crashsafe::durable_rename;

 use crate::{
    metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
-    rate_limit::rand_duration,
    timeline_manager::{Manager, StateSnapshot},
    wal_backup,
    wal_backup_partial::{self, PartialRemoteSegment},
@@ -50,6 +50,7 @@ impl Manager {
                .flush_lsn
                .segment_number(self.wal_seg_size)
                == self.last_removed_segno + 1
+            && self.resident_since.elapsed() >= self.conf.eviction_min_resident
    }

    /// Evict the timeline to remote storage.
@@ -111,8 +112,7 @@ impl Manager {
            return;
        }

-        self.evict_not_before =
-            tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
+        self.resident_since = Instant::now();

        info!("successfully restored evicted timeline");
    }
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -23,7 +23,6 @@ use utils::lsn::Lsn;
 use crate::{
    control_file::{FileStorage, Storage},
    metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
-    rate_limit::{rand_duration, RateLimiter},
    recovery::recovery_main,
    remove_wal::calc_horizon_lsn,
    safekeeper::Term,
@@ -33,7 +32,7 @@ use crate::{
    timeline_guard::{AccessService, GuardId, ResidenceGuard},
    timelines_set::{TimelineSetGuard, TimelinesSet},
    wal_backup::{self, WalBackupTaskHandle},
-    wal_backup_partial::{self, PartialRemoteSegment},
+    wal_backup_partial::{self, PartialRemoteSegment, RateLimiter},
    SafeKeeperConf,
 };

@@ -186,11 +185,11 @@ pub(crate) struct Manager {

    // misc
    pub(crate) access_service: AccessService,
-    pub(crate) global_rate_limiter: RateLimiter,
+    pub(crate) partial_backup_rate_limiter: RateLimiter,

    // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not
    // evict them if they go inactive very soon after being restored.
-    pub(crate) evict_not_before: Instant,
+    pub(crate) resident_since: std::time::Instant,
 }

 /// This task gets spawned alongside each timeline and is responsible for managing the timeline's
@@ -203,7 +202,7 @@ pub async fn main_task(
    broker_active_set: Arc<TimelinesSet>,
    manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
    mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
-    global_rate_limiter: RateLimiter,
+    partial_backup_rate_limiter: RateLimiter,
 ) {
    tli.set_status(Status::Started);

@@ -221,7 +220,7 @@ pub async fn main_task(
        conf,
        broker_active_set,
        manager_tx,
-        global_rate_limiter,
+        partial_backup_rate_limiter,
    )
    .await;

@@ -255,29 +254,9 @@ pub async fn main_task(
            mgr.set_status(Status::UpdatePartialBackup);
            mgr.update_partial_backup(&state_snapshot).await;

-            let now = Instant::now();
-            if mgr.evict_not_before > now {
-                // we should wait until evict_not_before
-                update_next_event(&mut next_event, mgr.evict_not_before);
-            }
-
-            if mgr.conf.enable_offload
-                && mgr.evict_not_before <= now
-                && mgr.ready_for_eviction(&next_event, &state_snapshot)
-            {
-                // check rate limiter and evict timeline if possible
-                match mgr.global_rate_limiter.try_acquire_eviction() {
-                    Some(_permit) => {
-                        mgr.set_status(Status::EvictTimeline);
-                        mgr.evict_timeline().await;
-                    }
-                    None => {
-                        // we can't evict timeline now, will try again later
-                        mgr.evict_not_before =
-                            Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
-                        update_next_event(&mut next_event, mgr.evict_not_before);
-                    }
-                }
+            if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) {
+                mgr.set_status(Status::EvictTimeline);
+                mgr.evict_timeline().await;
            }
        }

@@ -355,10 +334,11 @@ impl Manager {
        conf: SafeKeeperConf,
        broker_active_set: Arc<TimelinesSet>,
        manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
-        global_rate_limiter: RateLimiter,
+        partial_backup_rate_limiter: RateLimiter,
    ) -> Manager {
        let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
        Manager {
+            conf,
            wal_seg_size: tli.get_wal_seg_size().await,
            walsenders: tli.get_walsenders().clone(),
            state_version_rx: tli.get_state_version_rx(),
@@ -373,10 +353,8 @@ impl Manager {
            partial_backup_uploaded,
            access_service: AccessService::new(manager_tx),
            tli,
-            global_rate_limiter,
-            // to smooth out evictions spike after restart
-            evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident),
-            conf,
+            partial_backup_rate_limiter,
+            resident_since: std::time::Instant::now(),
        }
    }

@@ -544,8 +522,8 @@ impl Manager {

    /// Spawns partial WAL backup task if needed.
    async fn update_partial_backup(&mut self, state: &StateSnapshot) {
-        // check if WAL backup is enabled and should be started
-        if !self.conf.is_wal_backup_enabled() {
+        // check if partial backup is enabled and should be started
+        if !self.conf.is_wal_backup_enabled() || !self.conf.partial_backup_enabled {
            return;
        }

@@ -563,7 +541,7 @@ impl Manager {
        self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
            self.wal_resident_timeline(),
            self.conf.clone(),
-            self.global_rate_limiter.clone(),
+            self.partial_backup_rate_limiter.clone(),
        )));
    }

--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -2,11 +2,10 @@
 //! All timelines should always be present in this map, this is done by loading them
 //! all from the disk on startup and keeping them in memory.

-use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
-use crate::rate_limit::RateLimiter;
 use crate::safekeeper::ServerInfo;
 use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
 use crate::timelines_set::TimelinesSet;
+use crate::wal_backup_partial::RateLimiter;
 use crate::SafeKeeperConf;
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
@@ -32,7 +31,7 @@ struct GlobalTimelinesState {
    conf: Option<SafeKeeperConf>,
    broker_active_set: Arc<TimelinesSet>,
    load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
-    global_rate_limiter: RateLimiter,
+    partial_backup_rate_limiter: RateLimiter,
 }

 // Used to prevent concurrent timeline loading.
@@ -51,7 +50,7 @@ impl GlobalTimelinesState {
        (
            self.get_conf().clone(),
            self.broker_active_set.clone(),
-            self.global_rate_limiter.clone(),
+            self.partial_backup_rate_limiter.clone(),
        )
    }

@@ -86,7 +85,7 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
        conf: None,
        broker_active_set: Arc::new(TimelinesSet::default()),
        load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
-        global_rate_limiter: RateLimiter::new(1, 1),
+        partial_backup_rate_limiter: RateLimiter::new(1),
    })
 });

@@ -100,10 +99,7 @@ impl GlobalTimelines {
        // lock, so use explicit block
        let tenants_dir = {
            let mut state = TIMELINES_STATE.lock().unwrap();
-            state.global_rate_limiter = RateLimiter::new(
-                conf.partial_backup_concurrency,
-                DEFAULT_EVICTION_CONCURRENCY,
-            );
+            state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency);
            state.conf = Some(conf);

            // Iterate through all directories and load tenants for all directories
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -18,6 +18,8 @@
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.

+use std::sync::Arc;
+
 use camino::Utf8PathBuf;
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
 use remote_storage::RemotePath;
@@ -28,7 +30,6 @@ use utils::lsn::Lsn;

 use crate::{
    metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
-    rate_limit::{rand_duration, RateLimiter},
    safekeeper::Term,
    timeline::WalResidentTimeline,
    timeline_manager::StateSnapshot,
@@ -36,6 +37,30 @@ use crate::{
    SafeKeeperConf,
 };

+#[derive(Clone)]
+pub struct RateLimiter {
+    semaphore: Arc<tokio::sync::Semaphore>,
+}
+
+impl RateLimiter {
+    pub fn new(permits: usize) -> Self {
+        Self {
+            semaphore: Arc::new(tokio::sync::Semaphore::new(permits)),
+        }
+    }
+
+    async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit {
+        let _timer = MISC_OPERATION_SECONDS
+            .with_label_values(&["partial_permit_acquire"])
+            .start_timer();
+        self.semaphore
+            .clone()
+            .acquire_owned()
+            .await
+            .expect("semaphore is closed")
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub enum UploadStatus {
    /// Upload is in progress. This status should be used only for garbage collection,
@@ -327,7 +352,6 @@ pub async fn main_task(
 ) -> Option<PartialRemoteSegment> {
    debug!("started");
    let await_duration = conf.partial_backup_timeout;
-    let mut first_iteration = true;

    let (_, persistent_state) = tli.get_state().await;
    let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
@@ -395,15 +419,6 @@ pub async fn main_task(
            }
        }

-        // smoothing the load after restart, by sleeping for a random time.
-        // if this is not the first iteration, we will wait for the full await_duration
-        let await_duration = if first_iteration {
-            first_iteration = false;
-            rand_duration(&await_duration)
-        } else {
-            await_duration
-        };
-
        // fixing the segno and waiting some time to prevent reuploading the same segment too often
        let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
        let timeout = tokio::time::sleep(await_duration);
@@ -439,7 +454,7 @@ pub async fn main_task(
        }

        // limit concurrent uploads
-        let _upload_permit = limiter.acquire_partial_backup().await;
+        let _upload_permit = limiter.acquire_owned().await;

        let prepared = backup.prepare_upload().await;
        if let Some(seg) = &uploaded_segment {
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -181,6 +181,7 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
        sk_auth_token: None,
        current_thread_runtime: false,
        walsenders_keep_horizon: false,
+        partial_backup_enabled: false,
        partial_backup_timeout: Duration::from_secs(0),
        disable_periodic_broker_push: false,
        enable_offload: false,
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -67,7 +67,6 @@ FALLBACK_DURATION = {
    "test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
    "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
-    "test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735,
    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -642,7 +642,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    logging::replace_panic_hook_with_tracing_panic_hook().forget();
    // initialize sentry if SENTRY_DSN is provided
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
-    info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}");
+    info!("version: {GIT_VERSION}");
+    info!("build_tag: {BUILD_TAG}");
    metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);

    // On any shutdown signal, log receival and exit.
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -18,7 +18,6 @@ anyhow.workspace = true
 aws-config.workspace = true
 bytes.workspace = true
 camino.workspace = true
-chrono.workspace = true
 clap.workspace = true
 fail.workspace = true
 futures.workspace = true
@@ -32,7 +31,6 @@ once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
-rand.workspace = true
 reqwest = { workspace = true, features = ["stream"] }
 routerify.workspace = true
 serde.workspace = true
@@ -46,12 +44,7 @@ scopeguard.workspace = true
 strum.workspace = true
 strum_macros.workspace = true

-diesel = { version = "2.1.4", features = [
-    "serde_json",
-    "postgres",
-    "r2d2",
-    "chrono",
-] }
+diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }

@@ -59,3 +52,4 @@ utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
 control_plane = { path = "../control_plane" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
+
--- a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql
+++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql
@@ -1 +0,0 @@
-DROP TABLE metadata_health;
--- a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql
+++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql
@@ -1,14 +0,0 @@
-CREATE TABLE metadata_health (
-  tenant_id VARCHAR NOT NULL,
-  shard_number INTEGER NOT NULL,
-  shard_count INTEGER NOT NULL,
-  PRIMARY KEY(tenant_id, shard_number, shard_count),
-  -- Rely on cascade behavior for delete
-  FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE,
-  healthy BOOLEAN NOT NULL DEFAULT TRUE,
-  last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
-);
-
-
-INSERT INTO metadata_health(tenant_id, shard_number, shard_count)
-SELECT tenant_id, shard_number, shard_count FROM tenant_shards;
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -10,11 +10,7 @@ use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use metrics::{BuildInfo, NeonMetrics};
-use pageserver_api::controller_api::{
-    MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
-    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
-    TenantCreateRequest,
-};
+use pageserver_api::controller_api::TenantCreateRequest;
 use pageserver_api::models::{
    TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
    TenantTimeTravelRequest, TimelineCreateRequest,
@@ -564,51 +560,6 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
    json_response(StatusCode::ACCEPTED, ())
 }

-async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Scrubber)?;
-
-    let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
-    let state = get_state(&req);
-
-    state.service.metadata_health_update(update_req).await?;
-
-    json_response(StatusCode::OK, MetadataHealthUpdateResponse {})
-}
-
-async fn handle_metadata_health_list_unhealthy(
-    req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
-
-    json_response(
-        StatusCode::OK,
-        MetadataHealthListUnhealthyResponse {
-            unhealthy_tenant_shards,
-        },
-    )
-}
-
-async fn handle_metadata_health_list_outdated(
-    mut req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
-    let state = get_state(&req);
-    let health_records = state
-        .service
-        .metadata_health_list_outdated(list_outdated_req.not_scrubbed_for)
-        .await?;
-
-    json_response(
-        StatusCode::OK,
-        MetadataHealthListOutdatedResponse { health_records },
-    )
-}
-
 async fn handle_tenant_shard_split(
    service: Arc<Service>,
    mut req: Request<Body>,
@@ -1036,28 +987,6 @@ pub fn make_router(
                RequestName("control_v1_cancel_node_fill"),
            )
        })
-        // Metadata health operations
-        .post("/control/v1/metadata_health/update", |r| {
-            named_request_span(
-                r,
-                handle_metadata_health_update,
-                RequestName("control_v1_metadata_health_update"),
-            )
-        })
-        .get("/control/v1/metadata_health/unhealthy", |r| {
-            named_request_span(
-                r,
-                handle_metadata_health_list_unhealthy,
-                RequestName("control_v1_metadata_health_list_unhealthy"),
-            )
-        })
-        .post("/control/v1/metadata_health/outdated", |r| {
-            named_request_span(
-                r,
-                handle_metadata_health_list_outdated,
-                RequestName("control_v1_metadata_health_list_outdated"),
-            )
-        })
        // TODO(vlad): endpoint for cancelling drain and fill
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -9,14 +9,12 @@ use std::time::Duration;
 use storage_controller::http::make_router;
 use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
-use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
    Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
    RECONCILER_CONCURRENCY_DEFAULT,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
-use tracing::Instrument;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};

@@ -88,10 +86,6 @@ struct Cli {
    // TODO: make `cfg(feature = "testing")`
    #[arg(long)]
    neon_local_repo_dir: Option<PathBuf>,
-
-    /// Chaos testing
-    #[arg(long)]
-    chaos_interval: Option<humantime::Duration>,
 }

 enum StrictMode {
@@ -315,22 +309,6 @@ async fn async_main() -> anyhow::Result<()> {
    tracing::info!("Serving on {0}", args.listen);
    let server_task = tokio::task::spawn(server);

-    let chaos_task = args.chaos_interval.map(|interval| {
-        let service = service.clone();
-        let cancel = CancellationToken::new();
-        let cancel_bg = cancel.clone();
-        (
-            tokio::task::spawn(
-                async move {
-                    let mut chaos_injector = ChaosInjector::new(service, interval.into());
-                    chaos_injector.run(cancel_bg).await
-                }
-                .instrument(tracing::info_span!("chaos_injector")),
-            ),
-            cancel,
-        )
-    });
-
    // Wait until we receive a signal
    let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
    let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
@@ -359,12 +337,6 @@ async fn async_main() -> anyhow::Result<()> {
        }
    }

-    // If we were injecting chaos, stop that so that we're not calling into Service while it shuts down
-    if let Some((chaos_jh, chaos_cancel)) = chaos_task {
-        chaos_cancel.cancel();
-        chaos_jh.await.ok();
-    }
-
    service.shutdown().await;
    tracing::info!("Service shutdown complete");

--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -8,7 +8,6 @@ use self::split_state::SplitState;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
-use pageserver_api::controller_api::MetadataHealthRecord;
 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
@@ -91,10 +90,6 @@ pub(crate) enum DatabaseOperation {
    UpdateTenantShard,
    DeleteTenant,
    UpdateTenantConfig,
-    UpdateMetadataHealth,
-    ListMetadataHealth,
-    ListMetadataHealthUnhealthy,
-    ListMetadataHealthOutdated,
 }

 #[must_use]
@@ -312,32 +307,15 @@ impl Persistence {
        &self,
        shards: Vec<TenantShardPersistence>,
    ) -> DatabaseResult<()> {
-        use crate::schema::metadata_health;
-        use crate::schema::tenant_shards;
-
-        let now = chrono::Utc::now();
-
-        let metadata_health_records = shards
-            .iter()
-            .map(|t| MetadataHealthPersistence {
-                tenant_id: t.tenant_id.clone(),
-                shard_number: t.shard_number,
-                shard_count: t.shard_count,
-                healthy: true,
-                last_scrubbed_at: now,
-            })
-            .collect::<Vec<_>>();
-
+        use crate::schema::tenant_shards::dsl::*;
        self.with_measured_conn(
            DatabaseOperation::InsertTenantShards,
            move |conn| -> DatabaseResult<()> {
-                diesel::insert_into(tenant_shards::table)
-                    .values(&shards)
-                    .execute(conn)?;
-
-                diesel::insert_into(metadata_health::table)
-                    .values(&metadata_health_records)
-                    .execute(conn)?;
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
                Ok(())
            },
        )
@@ -351,10 +329,10 @@ impl Persistence {
        self.with_measured_conn(
            DatabaseOperation::DeleteTenant,
            move |conn| -> DatabaseResult<()> {
-                // `metadata_health` status (if exists) is also deleted based on the cascade behavior.
                diesel::delete(tenant_shards)
                    .filter(tenant_id.eq(del_tenant_id.to_string()))
                    .execute(conn)?;
+
                Ok(())
            },
        )
@@ -697,94 +675,6 @@ impl Persistence {
        )
        .await
    }
-
-    /// Stores all the latest metadata health updates durably. Updates existing entry on conflict.
-    ///
-    /// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller.
-    #[allow(dead_code)]
-    pub(crate) async fn update_metadata_health_records(
-        &self,
-        healthy_records: Vec<MetadataHealthPersistence>,
-        unhealthy_records: Vec<MetadataHealthPersistence>,
-        now: chrono::DateTime<chrono::Utc>,
-    ) -> DatabaseResult<()> {
-        use crate::schema::metadata_health::dsl::*;
-
-        self.with_measured_conn(
-            DatabaseOperation::UpdateMetadataHealth,
-            move |conn| -> DatabaseResult<_> {
-                diesel::insert_into(metadata_health)
-                    .values(&healthy_records)
-                    .on_conflict((tenant_id, shard_number, shard_count))
-                    .do_update()
-                    .set((healthy.eq(true), last_scrubbed_at.eq(now)))
-                    .execute(conn)?;
-
-                diesel::insert_into(metadata_health)
-                    .values(&unhealthy_records)
-                    .on_conflict((tenant_id, shard_number, shard_count))
-                    .do_update()
-                    .set((healthy.eq(false), last_scrubbed_at.eq(now)))
-                    .execute(conn)?;
-                Ok(())
-            },
-        )
-        .await
-    }
-
-    /// Lists all the metadata health records.
-    #[allow(dead_code)]
-    pub(crate) async fn list_metadata_health_records(
-        &self,
-    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
-        self.with_measured_conn(
-            DatabaseOperation::ListMetadataHealth,
-            move |conn| -> DatabaseResult<_> {
-                Ok(
-                    crate::schema::metadata_health::table
-                        .load::<MetadataHealthPersistence>(conn)?,
-                )
-            },
-        )
-        .await
-    }
-
-    /// Lists all the metadata health records that is unhealthy.
-    #[allow(dead_code)]
-    pub(crate) async fn list_unhealthy_metadata_health_records(
-        &self,
-    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
-        use crate::schema::metadata_health::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::ListMetadataHealthUnhealthy,
-            move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::metadata_health::table
-                    .filter(healthy.eq(false))
-                    .load::<MetadataHealthPersistence>(conn)?)
-            },
-        )
-        .await
-    }
-
-    /// Lists all the metadata health records that have not been updated since an `earlier` time.
-    #[allow(dead_code)]
-    pub(crate) async fn list_outdated_metadata_health_records(
-        &self,
-        earlier: chrono::DateTime<chrono::Utc>,
-    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
-        use crate::schema::metadata_health::dsl::*;
-
-        self.with_measured_conn(
-            DatabaseOperation::ListMetadataHealthOutdated,
-            move |conn| -> DatabaseResult<_> {
-                let query = metadata_health.filter(last_scrubbed_at.lt(earlier));
-                let res = query.load::<MetadataHealthPersistence>(conn)?;
-
-                Ok(res)
-            },
-        )
-        .await
-    }
 }

 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -854,59 +744,3 @@ pub(crate) struct NodePersistence {
    pub(crate) listen_pg_addr: String,
    pub(crate) listen_pg_port: i32,
 }
-
-/// Tenant metadata health status that are stored durably.
-#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
-#[diesel(table_name = crate::schema::metadata_health)]
-pub(crate) struct MetadataHealthPersistence {
-    #[serde(default)]
-    pub(crate) tenant_id: String,
-    #[serde(default)]
-    pub(crate) shard_number: i32,
-    #[serde(default)]
-    pub(crate) shard_count: i32,
-
-    pub(crate) healthy: bool,
-    pub(crate) last_scrubbed_at: chrono::DateTime<chrono::Utc>,
-}
-
-impl MetadataHealthPersistence {
-    pub fn new(
-        tenant_shard_id: TenantShardId,
-        healthy: bool,
-        last_scrubbed_at: chrono::DateTime<chrono::Utc>,
-    ) -> Self {
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let shard_number = tenant_shard_id.shard_number.0 as i32;
-        let shard_count = tenant_shard_id.shard_count.literal() as i32;
-
-        MetadataHealthPersistence {
-            tenant_id,
-            shard_number,
-            shard_count,
-            healthy,
-            last_scrubbed_at,
-        }
-    }
-
-    #[allow(dead_code)]
-    pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
-        Ok(TenantShardId {
-            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
-            shard_number: ShardNumber(self.shard_number as u8),
-            shard_count: ShardCount::new(self.shard_count as u8),
-        })
-    }
-}
-
-impl From<MetadataHealthPersistence> for MetadataHealthRecord {
-    fn from(value: MetadataHealthPersistence) -> Self {
-        MetadataHealthRecord {
-            tenant_shard_id: value
-                .get_tenant_shard_id()
-                .expect("stored tenant id should be valid"),
-            healthy: value.healthy,
-            last_scrubbed_at: value.last_scrubbed_at,
-        }
-    }
-}
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -656,8 +656,11 @@ impl Reconciler {
                    // reconcile this location.  This includes locations with different configurations, as well
                    // as locations with unknown (None) observed state.

-                    // Incrementing generation is the safe general case, but is inefficient for changes that only
-                    // modify some details (e.g. the tenant's config).
+                    // The general case is to increment the generation.  However, there are cases
+                    // where this is not necessary:
+                    // - if we are only updating the TenantConf part of the location
+                    // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
+                    //   and the location was already in the correct generation
                    let increment_generation = match observed {
                        None => true,
                        Some(ObservedStateLocation { conf: None }) => true,
@@ -666,11 +669,18 @@ impl Reconciler {
                        }) => {
                            let generations_match = observed.generation == wanted_conf.generation;

-                            // We may skip incrementing the generation if the location is already in the expected mode and
-                            // generation.  In principle it would also be safe to skip from certain other modes (e.g. AttachedStale),
-                            // but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up
-                            // after a restart/crash, so fall back to the universally safe path of incrementing generation.
-                            !generations_match || (observed.mode != wanted_conf.mode)
+                            use LocationConfigMode::*;
+                            let mode_transition_requires_gen_inc =
+                                match (observed.mode, wanted_conf.mode) {
+                                    // Usually the short-lived attachment modes (multi and stale) are only used
+                                    // in the case of [`Self::live_migrate`], but it is simple to handle them correctly
+                                    // here too.  Locations are allowed to go Single->Stale and Multi->Single within the same generation.
+                                    (AttachedSingle, AttachedStale) => false,
+                                    (AttachedMulti, AttachedSingle) => false,
+                                    (lhs, rhs) => lhs != rhs,
+                                };
+
+                            !generations_match || mode_transition_requires_gen_inc
                        }
                    };

--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -1,15 +1,5 @@
 // @generated automatically by Diesel CLI.

-diesel::table! {
-    metadata_health (tenant_id, shard_number, shard_count) {
-        tenant_id -> Varchar,
-        shard_number -> Int4,
-        shard_count -> Int4,
-        healthy -> Bool,
-        last_scrubbed_at -> Timestamptz,
-    }
-}
-
 diesel::table! {
    nodes (node_id) {
        node_id -> Int8,
@@ -36,4 +26,4 @@ diesel::table! {
    }
 }

-diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);
+diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -16,7 +16,7 @@ use crate::{
    compute_hook::NotifyError,
    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
    metrics::LeadershipStatusGroup,
-    persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
+    persistence::{AbortShardSplitStatus, TenantFilter},
    reconciler::{ReconcileError, ReconcileUnits},
    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
    tenant_shard::{
@@ -33,11 +33,11 @@ use futures::{stream::FuturesUnordered, StreamExt};
 use itertools::Itertools;
 use pageserver_api::{
    controller_api::{
-        MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
-        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
+        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
+        ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
+        TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
+        TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
+        TenantShardMigrateResponse, UtilizationScore,
    },
    models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
@@ -84,8 +84,6 @@ use crate::{
 };
 use serde::{Deserialize, Serialize};

-pub mod chaos_injector;
-
 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);

@@ -6097,68 +6095,6 @@ impl Service {
        Ok(())
    }

-    /// Updates scrubber metadata health check results.
-    pub(crate) async fn metadata_health_update(
-        &self,
-        update_req: MetadataHealthUpdateRequest,
-    ) -> Result<(), ApiError> {
-        let now = chrono::offset::Utc::now();
-        let (healthy_records, unhealthy_records) = {
-            let locked = self.inner.read().unwrap();
-            let healthy_records = update_req
-                .healthy_tenant_shards
-                .into_iter()
-                // Retain only health records associated with tenant shards managed by storage controller.
-                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
-                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now))
-                .collect();
-            let unhealthy_records = update_req
-                .unhealthy_tenant_shards
-                .into_iter()
-                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
-                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now))
-                .collect();
-
-            (healthy_records, unhealthy_records)
-        };
-
-        self.persistence
-            .update_metadata_health_records(healthy_records, unhealthy_records, now)
-            .await?;
-        Ok(())
-    }
-
-    /// Lists the tenant shards that has unhealthy metadata status.
-    pub(crate) async fn metadata_health_list_unhealthy(
-        &self,
-    ) -> Result<Vec<TenantShardId>, ApiError> {
-        let result = self
-            .persistence
-            .list_unhealthy_metadata_health_records()
-            .await?
-            .iter()
-            .map(|p| p.get_tenant_shard_id().unwrap())
-            .collect();
-
-        Ok(result)
-    }
-
-    /// Lists the tenant shards that have not been scrubbed for some duration.
-    pub(crate) async fn metadata_health_list_outdated(
-        &self,
-        not_scrubbed_for: Duration,
-    ) -> Result<Vec<MetadataHealthRecord>, ApiError> {
-        let earlier = chrono::offset::Utc::now() - not_scrubbed_for;
-        let result = self
-            .persistence
-            .list_outdated_metadata_health_records(earlier)
-            .await?
-            .into_iter()
-            .map(|record| record.into())
-            .collect();
-        Ok(result)
-    }
-
    pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
        self.inner.read().unwrap().get_leadership_status()
    }
--- a/storage_controller/src/service/chaos_injector.rs
+++ b/storage_controller/src/service/chaos_injector.rs
@@ -1,71 +0,0 @@
-use std::{sync::Arc, time::Duration};
-
-use rand::seq::SliceRandom;
-use rand::thread_rng;
-use tokio_util::sync::CancellationToken;
-
-use super::Service;
-
-pub struct ChaosInjector {
-    service: Arc<Service>,
-    interval: Duration,
-}
-
-impl ChaosInjector {
-    pub fn new(service: Arc<Service>, interval: Duration) -> Self {
-        Self { service, interval }
-    }
-
-    pub async fn run(&mut self, cancel: CancellationToken) {
-        let mut interval = tokio::time::interval(self.interval);
-
-        loop {
-            tokio::select! {
-                _ = interval.tick() => {}
-                _ = cancel.cancelled() => {
-                    tracing::info!("Shutting down");
-                    return;
-                }
-            }
-
-            self.inject_chaos().await;
-
-            tracing::info!("Chaos iteration...");
-        }
-    }
-
-    async fn inject_chaos(&mut self) {
-        // Pick some shards to interfere with
-        let batch_size = 128;
-        let mut inner = self.service.inner.write().unwrap();
-        let (nodes, tenants, scheduler) = inner.parts_mut();
-        let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
-        let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);
-
-        for victim in victims {
-            let shard = tenants
-                .get_mut(victim)
-                .expect("Held lock between choosing ID and this get");
-
-            // Pick a secondary to promote
-            let Some(new_location) = shard
-                .intent
-                .get_secondary()
-                .choose(&mut thread_rng())
-                .cloned()
-            else {
-                tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
-                continue;
-            };
-
-            let Some(old_location) = *shard.intent.get_attached() else {
-                tracing::info!("Skipping shard {victim}: currently has no attached location");
-                continue;
-            };
-
-            shard.intent.demote_attached(scheduler, old_location);
-            shard.intent.promote_attached(scheduler, new_location);
-            self.service.maybe_reconcile_shard(shard, nodes);
-        }
-    }
-}
--- a/storage_scrubber/Cargo.toml
+++ b/storage_scrubber/Cargo.toml
@@ -10,7 +10,6 @@ aws-smithy-async.workspace = true
 either.workspace = true
 tokio-rustls.workspace = true
 anyhow.workspace = true
-git-version.workspace = true
 hex.workspace = true
 humantime.workspace = true
 thiserror.workspace = true
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -40,11 +40,6 @@ impl TimelineAnalysis {
            garbage_keys: Vec::new(),
        }
    }
-
-    /// Whether a timeline is healthy.
-    pub(crate) fn is_healthy(&self) -> bool {
-        self.errors.is_empty() && self.warnings.is_empty()
-    }
 }

 pub(crate) async fn branch_cleanup_and_check_errors(
@@ -92,7 +87,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                            .push(format!("index_part.json version: {}", index_part.version()))
                    }

-                    let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3);
+                    let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2);
                    if !newest_versions.any(|ip| ip == &index_part.version()) {
                        info!(
                            "index_part.json version is not latest: {}",
@@ -172,11 +167,8 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                    }
                }
                BlobDataParseResult::Relic => {}
-                BlobDataParseResult::Incorrect {
-                    errors,
-                    s3_layers: _,
-                } => result.errors.extend(
-                    errors
+                BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
+                    parse_errors
                        .into_iter()
                        .map(|error| format!("parse error: {error}")),
                ),
@@ -303,10 +295,7 @@ pub(crate) enum BlobDataParseResult {
    },
    /// The remains of a deleted Timeline (i.e. an initdb archive only)
    Relic,
-    Incorrect {
-        errors: Vec<String>,
-        s3_layers: HashSet<(LayerName, Generation)>,
-    },
+    Incorrect(Vec<String>),
 }

 pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> {
@@ -449,7 +438,7 @@ pub(crate) async fn list_timeline_blobs(
    }

    Ok(S3TimelineBlobData {
-        blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
+        blob_data: BlobDataParseResult::Incorrect(errors),
        unused_index_keys: index_part_keys,
        unknown_keys,
    })
--- a/storage_scrubber/src/find_large_objects.rs
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -1,13 +1,10 @@
-use std::pin::pin;
-
 use futures::{StreamExt, TryStreamExt};
 use pageserver::tenant::storage_layer::LayerName;
-use remote_storage::ListingMode;
 use serde::{Deserialize, Serialize};

 use crate::{
-    checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
-    stream_objects_with_retries, BucketConfig, NodeKind,
+    checks::parse_layer_object_name, init_remote, list_objects_with_retries,
+    metadata_stream::stream_tenants, BucketConfig, NodeKind,
 };

 #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -50,38 +47,45 @@ pub async fn find_large_objects(
    ignore_deltas: bool,
    concurrency: usize,
 ) -> anyhow::Result<LargeObjectListing> {
-    let (remote_client, target) =
-        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
-    let tenants = pin!(stream_tenants_generic(&remote_client, &target));
+    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let tenants = std::pin::pin!(stream_tenants(&s3_client, &target));

    let objects_stream = tenants.map_ok(|tenant_shard_id| {
        let mut tenant_root = target.tenant_root(&tenant_shard_id);
-        let remote_client = remote_client.clone();
+        let s3_client = s3_client.clone();
        async move {
            let mut objects = Vec::new();
            let mut total_objects_ctr = 0u64;
            // We want the objects and not just common prefixes
            tenant_root.delimiter.clear();
-            let mut objects_stream = pin!(stream_objects_with_retries(
-                &remote_client,
-                ListingMode::NoDelimiter,
-                &tenant_root
-            ));
-            while let Some(listing) = objects_stream.next().await {
-                let listing = listing?;
-                for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) {
-                    let key = obj.key.to_string();
+            let mut continuation_token = None;
+            loop {
+                let fetch_response =
+                    list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone())
+                        .await?;
+                for obj in fetch_response.contents().iter().filter(|o| {
+                    if let Some(obj_size) = o.size {
+                        min_size as i64 <= obj_size
+                    } else {
+                        false
+                    }
+                }) {
+                    let key = obj.key().expect("couldn't get key").to_owned();
                    let kind = LargeObjectKind::from_key(&key);
                    if ignore_deltas && kind == LargeObjectKind::DeltaLayer {
                        continue;
                    }
                    objects.push(LargeObject {
                        key,
-                        size: obj.size,
+                        size: obj.size.unwrap() as u64,
                        kind,
                    })
                }
-                total_objects_ctr += listing.keys.len() as u64;
+                total_objects_ctr += fetch_response.contents().len() as u64;
+                match fetch_response.next_continuation_token {
+                    Some(new_token) => continuation_token = Some(new_token),
+                    None => break,
+                }
            }

            Ok((tenant_shard_id, objects, total_objects_ctr))
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -5,7 +5,6 @@
 use std::{
    collections::{HashMap, HashSet},
    sync::Arc,
-    time::Duration,
 };

 use anyhow::Context;
@@ -19,8 +18,8 @@ use utils::id::TenantId;

 use crate::{
    cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
-    init_remote_generic, list_objects_with_retries_generic,
-    metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
+    init_remote, init_remote_generic,
+    metadata_stream::{stream_tenant_timelines, stream_tenants},
    BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
 };

@@ -28,11 +27,6 @@ use crate::{
 enum GarbageReason {
    DeletedInConsole,
    MissingInConsole,
-
-    // The remaining data relates to a known deletion issue, and we're sure that purging this
-    // will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where
-    // there is nothing in a tenant path apart from a heatmap file.
-    KnownBug,
 }

 #[derive(Serialize, Deserialize, Debug)]
@@ -78,15 +72,6 @@ impl GarbageList {
        }
    }

-    /// If an entity has been identified as requiring purge due to a known bug, e.g.
-    /// a particular type of object left behind after an incomplete deletion.
-    fn append_buggy(&mut self, entity: GarbageEntity) {
-        self.items.push(GarbageItem {
-            entity,
-            reason: GarbageReason::KnownBug,
-        });
-    }
-
    /// Return true if appended, false if not.  False means the result was not garbage.
    fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool
    where
@@ -153,7 +138,7 @@ async fn find_garbage_inner(
    node_kind: NodeKind,
 ) -> anyhow::Result<GarbageList> {
    // Construct clients for S3 and for Console API
-    let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
+    let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
    let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));

    // Build a set of console-known tenants, for quickly eliminating known-active tenants without having
@@ -179,7 +164,7 @@ async fn find_garbage_inner(

    // Enumerate Tenants in S3, and check if each one exists in Console
    tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
-    let tenants = stream_tenants_generic(&remote_client, &target);
+    let tenants = stream_tenants(&s3_client, &target);
    let tenants_checked = tenants.map_ok(|t| {
        let api_client = cloud_admin_api_client.clone();
        let console_cache = console_cache.clone();
@@ -234,66 +219,6 @@ async fn find_garbage_inner(
            assert!(project.tenant == tenant_shard_id.tenant_id);
        }

-        // Special case: If it's missing in console, check for known bugs that would enable us to conclusively
-        // identify it as purge-able anyway
-        if console_result.is_none() {
-            let timelines =
-                stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
-                    .await?
-                    .collect::<Vec<_>>()
-                    .await;
-            if timelines.is_empty() {
-                // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
-                let tenant_objects = list_objects_with_retries_generic(
-                    &remote_client,
-                    ListingMode::WithDelimiter,
-                    &target.tenant_root(&tenant_shard_id),
-                )
-                .await?;
-                let object = tenant_objects.keys.first().unwrap();
-                if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
-                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
-                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
-                    continue;
-                } else {
-                    tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
-                }
-            } else {
-                // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
-                // rollout of WAL DR in which we never deleted these.
-                let mut any_non_initdb = false;
-
-                for timeline_r in timelines {
-                    let timeline = timeline_r?;
-                    let timeline_objects = list_objects_with_retries_generic(
-                        &remote_client,
-                        ListingMode::WithDelimiter,
-                        &target.timeline_root(&timeline),
-                    )
-                    .await?;
-                    if !timeline_objects.prefixes.is_empty() {
-                        // Sub-paths?  Unexpected
-                        any_non_initdb = true;
-                    } else {
-                        let object = timeline_objects.keys.first().unwrap();
-                        if object.key.get_path().as_str().ends_with("initdb.tar.zst") {
-                            tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
-                        } else {
-                            any_non_initdb = true;
-                        }
-                    }
-                }
-
-                if any_non_initdb {
-                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb");
-                } else {
-                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb");
-                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
-                    continue;
-                }
-            }
-        }
-
        if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
            tracing::debug!("Tenant {tenant_shard_id} is garbage");
        } else {
@@ -331,8 +256,7 @@ async fn find_garbage_inner(

    // Construct a stream of all timelines within active tenants
    let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
-    let timelines =
-        active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
+    let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t));
    let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
    let timelines = timelines.try_flatten();

@@ -425,6 +349,9 @@ pub async fn get_timeline_objects(
    tracing::debug!("Listing objects in timeline {ttid}");
    let timeline_root = super::remote_timeline_path_id(&ttid);

+    // TODO: apply extra validation based on object modification time.  Don't purge
+    // timelines whose index_part.json has been touched recently.
+
    let list = s3_client
        .list(
            Some(&timeline_root),
@@ -495,7 +422,6 @@ impl DeletionProgressTracker {
 pub async fn purge_garbage(
    input_path: String,
    mode: PurgeMode,
-    min_age: Duration,
    dry_run: bool,
 ) -> anyhow::Result<()> {
    let list_bytes = tokio::fs::read(&input_path).await?;
@@ -506,7 +432,7 @@ pub async fn purge_garbage(
        input_path
    );

-    let (remote_client, _target) =
+    let remote_client =
        init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;

    assert_eq!(
@@ -533,7 +459,6 @@ pub async fn purge_garbage(
        .filter(|i| match (&mode, &i.reason) {
            (PurgeMode::DeletedAndMissing, _) => true,
            (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true,
-            (PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true,
            (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false,
        });

@@ -562,37 +487,6 @@ pub async fn purge_garbage(
    let mut progress_tracker = DeletionProgressTracker::default();
    while let Some(result) = get_objects_results.next().await {
        let mut object_list = result?;
-
-        // Extra safety check: even if a collection of objects is garbage, check max() of modification
-        // times before purging, so that if we incorrectly marked a live tenant as garbage then we would
-        // notice that its index has been written recently and would omit deleting it.
-        if object_list.is_empty() {
-            // Simplify subsequent code by ensuring list always has at least one item
-            // Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes
-            continue;
-        }
-        let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap();
-        let age = max_mtime.elapsed();
-        match age {
-            Err(_) => {
-                tracing::warn!("Bad last_modified time");
-                continue;
-            }
-            Ok(a) if a < min_age => {
-                // Failed age check.  This doesn't mean we did something wrong: a tenant might really be garbage and recently
-                // written, but out of an abundance of caution we still don't purge it.
-                tracing::info!(
-                    "Skipping tenant with young objects {}..{}",
-                    object_list.first().as_ref().unwrap().key,
-                    object_list.last().as_ref().unwrap().key
-                );
-                continue;
-            }
-            Ok(_) => {
-                // Passed age check
-            }
-        }
-
        objects_to_delete.append(&mut object_list);
        if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
            do_delete(
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -16,26 +16,22 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{anyhow, Context};
-use aws_config::retry::{RetryConfigBuilder, RetryMode};
 use aws_sdk_s3::config::Region;
 use aws_sdk_s3::error::DisplayErrorContext;
 use aws_sdk_s3::Client;

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::ValueEnum;
-use futures::{Stream, StreamExt};
 use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{
-    GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind,
-    S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
+    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
 };
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
-use storage_controller_client::control_api;
 use tokio::io::AsyncReadExt;
-use tokio_util::sync::CancellationToken;
 use tracing::error;
 use tracing_appender::non_blocking::WorkerGuard;
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
@@ -257,12 +253,6 @@ pub struct ControllerClientConfig {
    pub controller_jwt: String,
 }

-impl ControllerClientConfig {
-    pub fn build_client(self) -> control_api::Client {
-        control_api::Client::new(self.controller_api, Some(self.controller_jwt))
-    }
-}
-
 pub struct ConsoleConfig {
    pub token: String,
    pub base_url: Url,
@@ -315,15 +305,8 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
 }

 async fn init_s3_client(bucket_region: Region) -> Client {
-    let mut retry_config_builder = RetryConfigBuilder::new();
-
-    retry_config_builder
-        .set_max_attempts(Some(3))
-        .set_mode(Some(RetryMode::Adaptive));
-
    let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
        .region(bucket_region)
-        .retry_config(retry_config_builder.build())
        .load()
        .await;
    Client::new(&config)
@@ -336,35 +319,27 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
    }
 }

-fn make_root_target(
-    bucket_name: String,
-    prefix_in_bucket: String,
-    node_kind: NodeKind,
-) -> RootTarget {
-    let s3_target = S3Target {
-        bucket_name,
-        prefix_in_bucket,
-        delimiter: "/".to_string(),
-    };
-    match node_kind {
-        NodeKind::Pageserver => RootTarget::Pageserver(s3_target),
-        NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target),
-    }
-}
-
 async fn init_remote(
    bucket_config: BucketConfig,
    node_kind: NodeKind,
 ) -> anyhow::Result<(Arc<Client>, RootTarget)> {
    let bucket_region = Region::new(bucket_config.region);
+    let delimiter = "/".to_string();
    let s3_client = Arc::new(init_s3_client(bucket_region).await);
    let default_prefix = default_prefix_in_bucket(node_kind).to_string();

-    let s3_root = make_root_target(
-        bucket_config.bucket,
-        bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
-        node_kind,
-    );
+    let s3_root = match node_kind {
+        NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
+            bucket_name: bucket_config.bucket,
+            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
+            delimiter,
+        }),
+        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
+            bucket_name: bucket_config.bucket,
+            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
+            delimiter,
+        }),
+    };

    Ok((s3_client, s3_root))
 }
@@ -372,12 +347,12 @@ async fn init_remote(
 async fn init_remote_generic(
    bucket_config: BucketConfig,
    node_kind: NodeKind,
-) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
+) -> anyhow::Result<GenericRemoteStorage> {
    let endpoint = env::var("AWS_ENDPOINT_URL").ok();
    let default_prefix = default_prefix_in_bucket(node_kind).to_string();
    let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
    let storage = S3Config {
-        bucket_name: bucket_config.bucket.clone(),
+        bucket_name: bucket_config.bucket,
        bucket_region: bucket_config.region,
        prefix_in_bucket,
        endpoint,
@@ -391,13 +366,7 @@ async fn init_remote_generic(
        storage: RemoteStorageKind::AwsS3(storage),
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
-
-    // We already pass the prefix to the remote client above
-    let prefix_in_root_target = String::new();
-    let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
-
-    let client = GenericRemoteStorage::from_config(&storage_config).await?;
-    Ok((client, s3_root))
+    GenericRemoteStorage::from_config(&storage_config).await
 }

 async fn list_objects_with_retries(
@@ -435,84 +404,6 @@ async fn list_objects_with_retries(
    Err(anyhow!("unreachable unless MAX_RETRIES==0"))
 }

-/// Listing possibly large amounts of keys in a streaming fashion.
-fn stream_objects_with_retries<'a>(
-    storage_client: &'a GenericRemoteStorage,
-    listing_mode: ListingMode,
-    s3_target: &'a S3Target,
-) -> impl Stream<Item = Result<Listing, anyhow::Error>> + 'a {
-    async_stream::stream! {
-        let mut trial = 0;
-        let cancel = CancellationToken::new();
-        let prefix_str = &s3_target
-            .prefix_in_bucket
-            .strip_prefix("/")
-            .unwrap_or(&s3_target.prefix_in_bucket);
-        let prefix = RemotePath::from_string(prefix_str)?;
-        let mut list_stream =
-            storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
-        while let Some(res) = list_stream.next().await {
-            if let Err(err) = res {
-                let yield_err = if err.is_permanent() {
-                    true
-                } else {
-                    let backoff_time = 1 << trial.max(5);
-                    tokio::time::sleep(Duration::from_secs(backoff_time)).await;
-                    trial += 1;
-                    trial == MAX_RETRIES - 1
-                };
-                if yield_err {
-                    yield Err(err)
-                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
-                    break;
-                }
-            } else {
-                trial = 0;
-                yield res.map_err(anyhow::Error::from);
-            }
-        }
-    }
-}
-
-/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
-/// use [`stream_objects_with_retries`] instead.
-async fn list_objects_with_retries_generic(
-    remote_client: &GenericRemoteStorage,
-    listing_mode: ListingMode,
-    s3_target: &S3Target,
-) -> anyhow::Result<Listing> {
-    let cancel = CancellationToken::new();
-    let prefix_str = &s3_target
-        .prefix_in_bucket
-        .strip_prefix("/")
-        .unwrap_or(&s3_target.prefix_in_bucket);
-    let prefix = RemotePath::from_string(prefix_str)?;
-    for trial in 0..MAX_RETRIES {
-        match remote_client
-            .list(Some(&prefix), listing_mode, None, &cancel)
-            .await
-        {
-            Ok(response) => return Ok(response),
-            Err(e) => {
-                if trial == MAX_RETRIES - 1 {
-                    return Err(e)
-                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
-                }
-                error!(
-                    "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
-                    s3_target.bucket_name,
-                    s3_target.prefix_in_bucket,
-                    s3_target.delimiter,
-                    DisplayErrorContext(e),
-                );
-                let backoff_time = 1 << trial.max(5);
-                tokio::time::sleep(Duration::from_secs(backoff_time)).await;
-            }
-        }
-    }
-    panic!("MAX_RETRIES is not allowed to be 0");
-}
-
 async fn download_object_with_retries(
    s3_client: &Client,
    bucket_name: &str,
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -1,8 +1,7 @@
 use anyhow::{anyhow, bail};
 use camino::Utf8PathBuf;
-use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
-use reqwest::{Method, Url};
+use reqwest::Url;
 use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use storage_scrubber::pageserver_physical_gc::GcMode;
 use storage_scrubber::scan_pageserver_metadata::scan_metadata;
@@ -17,11 +16,6 @@ use storage_scrubber::{
 use clap::{Parser, Subcommand};
 use utils::id::TenantId;

-use utils::{project_build_tag, project_git_version};
-
-project_git_version!(GIT_VERSION);
-project_build_tag!(BUILD_TAG);
-
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -56,8 +50,6 @@ enum Command {
        input_path: String,
        #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
        mode: PurgeMode,
-        #[arg(long = "min-age")]
-        min_age: humantime::Duration,
    },
    #[command(verbatim_doc_comment)]
    ScanMetadata {
@@ -67,8 +59,6 @@ enum Command {
        json: bool,
        #[arg(long = "tenant-id", num_args = 0..)]
        tenant_ids: Vec<TenantShardId>,
-        #[arg(long = "post", default_value_t = false)]
-        post_to_storage_controller: bool,
        #[arg(long, default_value = None)]
        /// For safekeeper node_kind only, points to db with debug dump
        dump_db_connstr: Option<String>,
@@ -106,8 +96,6 @@ enum Command {
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

-    tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
-
    let bucket_config = BucketConfig::from_env()?;

    let command_log_name = match &cli.command {
@@ -126,20 +114,11 @@ async fn main() -> anyhow::Result<()> {
        chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
    ));

-    let controller_client_conf = cli.controller_api.map(|controller_api| {
-        ControllerClientConfig {
-            controller_api,
-            // Default to no key: this is a convenience when working in a development environment
-            controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
-        }
-    });
-
    match cli.command {
        Command::ScanMetadata {
            json,
            tenant_ids,
            node_kind,
-            post_to_storage_controller,
            dump_db_connstr,
            dump_db_table,
        } => {
@@ -178,9 +157,6 @@ async fn main() -> anyhow::Result<()> {
                }
                Ok(())
            } else {
-                if controller_client_conf.is_none() && post_to_storage_controller {
-                    return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
-                }
                match scan_metadata(bucket_config.clone(), tenant_ids).await {
                    Err(e) => {
                        tracing::error!("Failed: {e}");
@@ -192,37 +168,22 @@ async fn main() -> anyhow::Result<()> {
                        } else {
                            println!("{}", summary.summary_string());
                        }
-
-                        if post_to_storage_controller {
-                            if let Some(conf) = controller_client_conf {
-                                let controller_client = conf.build_client();
-                                let body = summary.build_health_update_request();
-                                controller_client
-                                    .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
-                                        Method::POST,
-                                        "control/v1/metadata_health/update".to_string(),
-                                        Some(body),
-                                    )
-                                    .await?;
-                            }
-                        }
-
                        if summary.is_fatal() {
-                            tracing::error!("Fatal scrub errors detected");
+                            Err(anyhow::anyhow!("Fatal scrub errors detected"))
                        } else if summary.is_empty() {
                            // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
                            // scrubber they were likely expecting to scan something, and if we see no timelines
                            // at all then it's likely due to some configuration issues like a bad prefix
-                            tracing::error!(
+                            Err(anyhow::anyhow!(
                                "No timelines found in bucket {} prefix {}",
                                bucket_config.bucket,
                                bucket_config
                                    .prefix_in_bucket
                                    .unwrap_or("<none>".to_string())
-                            );
+                            ))
+                        } else {
+                            Ok(())
                        }
-
-                        Ok(())
                    }
                }
            }
@@ -235,11 +196,9 @@ async fn main() -> anyhow::Result<()> {
            let console_config = ConsoleConfig::from_env()?;
            find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
        }
-        Command::PurgeGarbage {
-            input_path,
-            mode,
-            min_age,
-        } => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await,
+        Command::PurgeGarbage { input_path, mode } => {
+            purge_garbage(input_path, mode, !cli.delete).await
+        }
        Command::TenantSnapshot {
            tenant_id,
            output_path,
@@ -254,6 +213,14 @@ async fn main() -> anyhow::Result<()> {
            min_age,
            mode,
        } => {
+            let controller_client_conf = cli.controller_api.map(|controller_api| {
+                ControllerClientConfig {
+                    controller_api,
+                    // Default to no key: this is a convenience when working in a development environment
+                    controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
+                }
+            });
+
            match (&controller_client_conf, mode) {
                (Some(_), _) => {
                    // Any mode may run when controller API is set
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -1,41 +1,12 @@
-use std::str::FromStr;
-
-use anyhow::{anyhow, Context};
+use anyhow::Context;
 use async_stream::{stream, try_stream};
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
-use futures::StreamExt;
-use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
 use tokio_stream::Stream;

-use crate::{
-    list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
-    TenantShardTimelineId,
-};
+use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
 use pageserver_api::shard::TenantShardId;
 use utils::id::{TenantId, TimelineId};

-/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
-pub fn stream_tenants_generic<'a>(
-    remote_client: &'a GenericRemoteStorage,
-    target: &'a RootTarget,
-) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
-    try_stream! {
-        let tenants_target = target.tenants_root();
-        let mut tenants_stream =
-            std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target));
-        while let Some(chunk) = tenants_stream.next().await {
-            let chunk = chunk?;
-            let entry_ids = chunk.prefixes.iter()
-                .map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'")));
-            for dir_name_res in entry_ids {
-                let dir_name = dir_name_res?;
-                let id = TenantShardId::from_str(dir_name)?;
-                yield id;
-            }
-        }
-    }
-}
-
 /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
 pub fn stream_tenants<'a>(
    s3_client: &'a Client,
@@ -189,63 +160,6 @@ pub async fn stream_tenant_timelines<'a>(
    })
 }

-/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
-/// using a listing. The listing is done before the stream is built, so that this
-/// function can be used to generate concurrency on a stream using buffer_unordered.
-pub async fn stream_tenant_timelines_generic<'a>(
-    remote_client: &'a GenericRemoteStorage,
-    target: &'a RootTarget,
-    tenant: TenantShardId,
-) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
-    let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
-    let timelines_target = target.timelines_root(&tenant);
-
-    let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
-        remote_client,
-        ListingMode::WithDelimiter,
-        &timelines_target
-    ));
-    loop {
-        tracing::debug!("Listing in {tenant}");
-        let fetch_response = match objects_stream.next().await {
-            None => break,
-            Some(Err(e)) => {
-                timeline_ids.push(Err(e));
-                break;
-            }
-            Some(Ok(r)) => r,
-        };
-
-        let new_entry_ids = fetch_response
-            .prefixes
-            .iter()
-            .filter_map(|prefix| -> Option<&str> {
-                prefix
-                    .get_path()
-                    .as_str()
-                    .strip_prefix(&timelines_target.prefix_in_bucket)?
-                    .strip_suffix('/')
-            })
-            .map(|entry_id_str| {
-                entry_id_str
-                    .parse::<TimelineId>()
-                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
-            });
-
-        for i in new_entry_ids {
-            timeline_ids.push(i);
-        }
-    }
-
-    tracing::debug!("Yielding for {}", tenant);
-    Ok(stream! {
-        for i in timeline_ids {
-            let id = i?;
-            yield Ok(TenantShardTimelineId::new(tenant, id));
-        }
-    })
-}
-
 pub(crate) fn stream_listing<'a>(
    s3_client: &'a Client,
    target: &'a S3Target,
@@ -276,33 +190,3 @@ pub(crate) fn stream_listing<'a>(
        }
    }
 }
-
-pub(crate) fn stream_listing_generic<'a>(
-    remote_client: &'a GenericRemoteStorage,
-    target: &'a S3Target,
-) -> impl Stream<Item = anyhow::Result<(RemotePath, Option<ListingObject>)>> + 'a {
-    let listing_mode = if target.delimiter.is_empty() {
-        ListingMode::NoDelimiter
-    } else {
-        ListingMode::WithDelimiter
-    };
-    try_stream! {
-        let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
-            remote_client,
-            listing_mode,
-            target,
-        ));
-        while let Some(list) = objects_stream.next().await {
-            let list = list?;
-            if target.delimiter.is_empty() {
-                for key in list.keys {
-                    yield (key.key.clone(), Some(key));
-                }
-            } else {
-                for key in list.prefixes {
-                    yield (key, None);
-                }
-            }
-        }
-    }
-}
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -389,13 +389,10 @@ async fn gc_ancestor(
                // Post-deletion tenant location: don't try and GC it.
                continue;
            }
-            BlobDataParseResult::Incorrect {
-                errors,
-                s3_layers: _, // TODO(yuchen): could still check references to these s3 layers?
-            } => {
+            BlobDataParseResult::Incorrect(reasons) => {
                // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
                tracing::warn!(
-                    "Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}"
+                    "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}"
                );
                continue;
            }
@@ -521,12 +518,9 @@ pub async fn pageserver_physical_gc(
                // Post-deletion tenant location: don't try and GC it.
                return Ok(summary);
            }
-            BlobDataParseResult::Incorrect {
-                errors,
-                s3_layers: _,
-            } => {
+            BlobDataParseResult::Incorrect(reasons) => {
                // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
-                tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}");
+                tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}");
                return Ok(summary);
            }
        };
@@ -573,7 +567,13 @@ pub async fn pageserver_physical_gc(
    }

    // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
-    let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
+    let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
+        let ControllerClientConfig {
+            controller_api,
+            controller_jwt,
+        } = c;
+        control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
+    }) else {
        tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
        return Ok(summary);
    };
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -9,13 +9,12 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline
 use aws_sdk_s3::Client;
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
-use pageserver_api::controller_api::MetadataHealthUpdateRequest;
 use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use utils::id::TenantId;
 use utils::shard::ShardCount;

-#[derive(Serialize, Default)]
+#[derive(Serialize)]
 pub struct MetadataSummary {
    tenant_count: usize,
    timeline_count: usize,
@@ -24,16 +23,19 @@ pub struct MetadataSummary {
    with_warnings: HashSet<TenantShardTimelineId>,
    with_orphans: HashSet<TenantShardTimelineId>,
    indices_by_version: HashMap<usize, usize>,
-
-    #[serde(skip)]
-    pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
-    #[serde(skip)]
-    pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
 }

 impl MetadataSummary {
    fn new() -> Self {
-        Self::default()
+        Self {
+            tenant_count: 0,
+            timeline_count: 0,
+            timeline_shard_count: 0,
+            with_errors: HashSet::new(),
+            with_warnings: HashSet::new(),
+            with_orphans: HashSet::new(),
+            indices_by_version: HashMap::new(),
+        }
    }

    fn update_data(&mut self, data: &S3TimelineBlobData) {
@@ -52,13 +54,6 @@ impl MetadataSummary {
    }

    fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
-        if analysis.is_healthy() {
-            self.healthy_tenant_shards.insert(id.tenant_shard_id);
-        } else {
-            self.healthy_tenant_shards.remove(&id.tenant_shard_id);
-            self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
-        }
-
        if !analysis.errors.is_empty() {
            self.with_errors.insert(*id);
        }
@@ -106,13 +101,6 @@ Index versions: {version_summary}
    pub fn is_empty(&self) -> bool {
        self.timeline_shard_count == 0
    }
-
-    pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
-        MetadataHealthUpdateRequest {
-            healthy_tenant_shards: self.healthy_tenant_shards.clone(),
-            unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
-        }
-    }
 }

 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
@@ -290,21 +278,13 @@ pub async fn scan_metadata(
            }
        }

-        match &data.blob_data {
-            BlobDataParseResult::Parsed {
-                index_part: _index_part,
-                index_part_generation: _index_part_generation,
-                s3_layers,
-            } => {
-                tenant_objects.push(ttid, s3_layers.clone());
-            }
-            BlobDataParseResult::Relic => (),
-            BlobDataParseResult::Incorrect {
-                errors: _,
-                s3_layers,
-            } => {
-                tenant_objects.push(ttid, s3_layers.clone());
-            }
+        if let BlobDataParseResult::Parsed {
+            index_part: _index_part,
+            index_part_generation: _index_part_generation,
+            s3_layers,
+        } = &data.blob_data
+        {
+            tenant_objects.push(ttid, s3_layers.clone());
        }
        tenant_timeline_results.push((ttid, data));
    }
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -1,10 +1,10 @@
 use std::{collections::HashSet, str::FromStr, sync::Arc};

+use aws_sdk_s3::Client;
 use futures::stream::{StreamExt, TryStreamExt};
 use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
 use postgres_ffi::{XLogFileName, PG_TLI};
-use remote_storage::GenericRemoteStorage;
 use serde::Serialize;
 use tokio_postgres::types::PgLsn;
 use tracing::{error, info, trace};
@@ -14,9 +14,8 @@ use utils::{
 };

 use crate::{
-    cloud_admin_api::CloudAdminApiClient, init_remote_generic,
-    metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget,
-    TenantShardTimelineId,
+    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
+    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
 };

 /// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
@@ -107,7 +106,7 @@ pub async fn scan_safekeeper_metadata(
    let timelines = client.query(&query, &[]).await?;
    info!("loaded {} timelines", timelines.len());

-    let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?;
+    let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?;
    let console_config = ConsoleConfig::from_env()?;
    let cloud_admin_api_client = CloudAdminApiClient::new(console_config);

@@ -120,7 +119,7 @@ pub async fn scan_safekeeper_metadata(
        let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
        check_timeline(
-            &remote_client,
+            &s3_client,
            &target,
            &cloud_admin_api_client,
            ttid,
@@ -157,7 +156,7 @@ struct TimelineCheckResult {
 /// errors are logged to stderr; returns Ok(true) if timeline is consistent,
 /// Ok(false) if not, Err if failed to check.
 async fn check_timeline(
-    remote_client: &GenericRemoteStorage,
+    s3_client: &Client,
    root: &RootTarget,
    api_client: &CloudAdminApiClient,
    ttid: TenantTimelineId,
@@ -188,13 +187,12 @@ async fn check_timeline(
    // we need files, so unset it.
    timeline_dir_target.delimiter = String::new();

-    let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
+    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
    while let Some(obj) = stream.next().await {
-        let (key, _obj) = obj?;
+        let obj = obj?;
+        let key = obj.key();

        let seg_name = key
-            .get_path()
-            .as_str()
            .strip_prefix(&timeline_dir_target.prefix_in_bucket)
            .expect("failed to extract segment name");
        expected_segfiles.remove(seg_name);
--- a/storage_scrubber/src/tenant_snapshot.rs
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -269,7 +269,7 @@ impl SnapshotDownloader {
                        .context("Downloading timeline")?;
                    }
                    BlobDataParseResult::Relic => {}
-                    BlobDataParseResult::Incorrect { .. } => {
+                    BlobDataParseResult::Incorrect(_) => {
                        tracing::error!("Bad metadata in timeline {ttid}");
                    }
                };
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Conrad Ludgate	ef7e96fb4e	tweak comments	2024-07-29 11:41:44 +01:00
Conrad Ludgate	54c5196f75	proxy: improve performance of leaky-bucket	2024-07-28 23:00:21 +01:00