Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-11 22:50:37 +00:00)

Compare commits: conrad/pro...release-62 (511 commits)
.github/actions/neon-project-create/action.yml | 12 (vendored)

@@ -14,11 +14,8 @@ inputs:
  api_host:
  description: 'Neon API host'
  default: console-stage.neon.build
- provisioner:
- description: 'k8s-pod or k8s-neonvm'
- default: 'k8s-pod'
  compute_units:
- description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
+ description: '[Min, Max] compute units'
  default: '[1, 1]'

  outputs:
@@ -37,10 +34,6 @@ runs:
  # A shell without `set -x` to not to expose password/dsn in logs
  shell: bash -euo pipefail {0}
  run: |
- if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
- echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
- fi
-
  project=$(curl \
  "https://${API_HOST}/api/v2/projects" \
  --fail \
@@ -52,7 +45,7 @@ runs:
  \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
  \"pg_version\": ${POSTGRES_VERSION},
  \"region_id\": \"${REGION_ID}\",
- \"provisioner\": \"${PROVISIONER}\",
+ \"provisioner\": \"k8s-neonvm\",
  \"autoscaling_limit_min_cu\": ${MIN_CU},
  \"autoscaling_limit_max_cu\": ${MAX_CU},
  \"settings\": { }
@@ -75,6 +68,5 @@ runs:
  API_KEY: ${{ inputs.api_key }}
  REGION_ID: ${{ inputs.region_id }}
  POSTGRES_VERSION: ${{ inputs.postgres_version }}
- PROVISIONER: ${{ inputs.provisioner }}
  MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
  MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
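After this change the action always provisions `k8s-neonvm` computes, so the `provisioner` input and the `k8s-pod` MIN_CU/MAX_CU validation disappear. For reference, a minimal Rust sketch of the API call that the action's `curl` step performs is below; it assumes the `reqwest` (with its `json` feature), `serde_json`, and `anyhow` crates and includes only the request fields visible in the hunk above — the real payload, wrapping structure, and response handling are abbreviated, and the helper name is hypothetical.

```rust
use serde_json::json;

// Hypothetical helper mirroring the action's curl call; api_host/api_key map to
// the action inputs of the same name. Only fields shown in the diff are included.
async fn create_neon_project(api_host: &str, api_key: &str) -> anyhow::Result<serde_json::Value> {
    let body = json!({
        "name": "Created by actions/neon-project-create",
        "pg_version": 16,                // POSTGRES_VERSION input
        "region_id": "aws-us-east-2",    // REGION_ID input
        "provisioner": "k8s-neonvm",     // hard-coded after this change
        "autoscaling_limit_min_cu": 1,   // MIN_CU, from compute_units[0]
        "autoscaling_limit_max_cu": 1,   // MAX_CU, from compute_units[1]
        "settings": {}
    });
    let resp = reqwest::Client::new()
        .post(format!("https://{api_host}/api/v2/projects"))
        .bearer_auth(api_key)
        .json(&body)
        .send()
        .await?
        .error_for_status()?;
    Ok(resp.json().await?)
}
```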
@@ -19,6 +19,10 @@ on:
  description: 'debug or release'
  required: true
  type: string
+ pg-versions:
+ description: 'a json array of postgres versions to run regression tests on'
+ required: true
+ type: string

  defaults:
  run:
@@ -254,7 +258,7 @@ jobs:
  strategy:
  fail-fast: false
  matrix:
- pg_version: [ v14, v15, v16 ]
+ pg_version: ${{ fromJson(inputs.pg-versions) }}
  steps:
  - uses: actions/checkout@v4
  with:
@@ -284,5 +288,5 @@ jobs:
  - name: Merge and upload coverage data
  if: |
  false &&
- inputs.build-type == 'debug' && matrix.pg_version == 'v14'
+ inputs.build-type == 'debug' && matrix.pg_version == 'v16'
  uses: ./.github/actions/save-coverage-data
.github/workflows/benchmarking.yml | 52 (vendored)

@@ -63,11 +63,9 @@ jobs:
  - DEFAULT_PG_VERSION: 16
  PLATFORM: "neon-staging"
  region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
- provisioner: 'k8s-pod'
  - DEFAULT_PG_VERSION: 16
  PLATFORM: "azure-staging"
  region_id: 'azure-eastus2'
- provisioner: 'k8s-neonvm'
  env:
  TEST_PG_BENCH_DURATIONS_MATRIX: "300"
  TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -100,7 +98,6 @@ jobs:
  region_id: ${{ matrix.region_id }}
  postgres_version: ${{ env.DEFAULT_PG_VERSION }}
  api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- provisioner: ${{ matrix.provisioner }}

  - name: Run benchmark
  uses: ./.github/actions/run-python-test-set
@@ -216,11 +213,11 @@ jobs:
  # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
  #
  # Available platforms:
- # - neon-captest-new: Freshly created project (1 CU)
- # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
+ # - neonvm-captest-new: Freshly created project (1 CU)
+ # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
  # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
  # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
- # - neon-captest-reuse: Reusing existing project
+ # - neonvm-captest-reuse: Reusing existing project
  # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
  # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
  env:
@@ -245,18 +242,16 @@ jobs:
  "'"$region_id_default"'"
  ],
  "platform": [
- "neon-captest-new",
- "neon-captest-reuse",
+ "neonvm-captest-new",
+ "neonvm-captest-reuse",
  "neonvm-captest-new"
  ],
  "db_size": [ "10gb" ],
- "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
- { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
- { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
+ "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
  { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
  { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
  { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
  { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
  { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
  }'

@@ -271,7 +266,7 @@ jobs:
  run: |
  matrix='{
  "platform": [
- "neon-captest-reuse"
+ "neonvm-captest-reuse"
  ]
  }'

@@ -287,7 +282,7 @@ jobs:
  run: |
  matrix='{
  "platform": [
- "neon-captest-reuse"
+ "neonvm-captest-reuse"
  ],
  "scale": [
  "10"
@@ -338,7 +333,7 @@ jobs:
  prefix: latest

  - name: Create Neon Project
- if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
+ if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
  id: create-neon-project
  uses: ./.github/actions/neon-project-create
  with:
@@ -346,19 +341,18 @@ jobs:
  postgres_version: ${{ env.DEFAULT_PG_VERSION }}
  api_key: ${{ secrets.NEON_STAGING_API_KEY }}
  compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
- provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

  - name: Set up Connection String
  id: set-up-connstr
  run: |
  case "${PLATFORM}" in
- neon-captest-reuse)
+ neonvm-captest-reuse)
  CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
  ;;
  neonvm-captest-sharding-reuse)
  CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
  ;;
- neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
+ neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
  CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
  ;;
  rds-aurora)
@@ -442,9 +436,9 @@ jobs:
  fail-fast: false
  matrix:
  include:
- - PLATFORM: "neon-captest-pgvector"
+ - PLATFORM: "neonvm-captest-pgvector"
  - PLATFORM: "azure-captest-pgvector"

  env:
  TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
  TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -486,7 +480,7 @@ jobs:
  id: set-up-connstr
  run: |
  case "${PLATFORM}" in
- neon-captest-pgvector)
+ neonvm-captest-pgvector)
  CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
  ;;
  azure-captest-pgvector)
@@ -585,7 +579,7 @@ jobs:
  id: set-up-connstr
  run: |
  case "${PLATFORM}" in
- neon-captest-reuse)
+ neonvm-captest-reuse)
  CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
  ;;
  rds-aurora)
@@ -595,7 +589,7 @@ jobs:
  CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
  ;;
  *)
- echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+ echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
  exit 1
  ;;
  esac
@@ -672,7 +666,7 @@ jobs:
  - name: Get Connstring Secret Name
  run: |
  case "${PLATFORM}" in
- neon-captest-reuse)
+ neonvm-captest-reuse)
  ENV_PLATFORM=CAPTEST_TPCH
  ;;
  rds-aurora)
@@ -682,7 +676,7 @@ jobs:
  ENV_PLATFORM=RDS_AURORA_TPCH
  ;;
  *)
- echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+ echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
  exit 1
  ;;
  esac
@@ -759,7 +753,7 @@ jobs:
  id: set-up-connstr
  run: |
  case "${PLATFORM}" in
- neon-captest-reuse)
+ neonvm-captest-reuse)
  CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
  ;;
  rds-aurora)
@@ -769,7 +763,7 @@ jobs:
  CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
  ;;
  *)
- echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+ echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
  exit 1
  ;;
  esac
.github/workflows/build_and_test.yml | 32 (vendored)

@@ -203,7 +203,8 @@ jobs:
  fail-fast: false
  matrix:
  arch: [ x64 ]
- build-type: [ debug, release ]
+ # Do not build or run tests in debug for release branches
+ build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
  include:
  - build-type: release
  arch: arm64
@@ -213,6 +214,8 @@ jobs:
  build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
  build-tag: ${{ needs.tag.outputs.build-tag }}
  build-type: ${{ matrix.build-type }}
+ # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
+ pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
  secrets: inherit

  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -306,7 +309,7 @@ jobs:
  SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  create-test-report:
- needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
+ needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
  if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
  outputs:
  report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -833,6 +836,9 @@ jobs:
  rm -rf .docker-custom

  promote-images:
+ permissions:
+ contents: read # This is required for actions/checkout
+ id-token: write # This is required for Azure Login to work.
  needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
  runs-on: ubuntu-22.04

@@ -859,6 +865,28 @@ jobs:
  neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
  done

+ - name: Azure login
+ if: github.ref_name == 'main'
+ uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
+ with:
+ client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
+ tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+ subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
+
+ - name: Login to ACR
+ if: github.ref_name == 'main'
+ run: |
+ az acr login --name=neoneastus2
+
+ - name: Copy docker images to ACR-dev
+ if: github.ref_name == 'main'
+ run: |
+ for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
+ docker buildx imagetools create \
+ -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
+ neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
+ done
+
  - name: Add latest tag to images
  if: github.ref_name == 'main'
  run: |
.github/workflows/pg-clients.yml | 72 (vendored)

@@ -13,6 +13,7 @@ on:
  paths:
  - '.github/workflows/pg-clients.yml'
  - 'test_runner/pg_clients/**'
+ - 'test_runner/logical_repl/**'
  - 'poetry.lock'
  workflow_dispatch:

@@ -49,6 +50,77 @@ jobs:
  image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
  secrets: inherit

+ test-logical-replication:
+ needs: [ build-build-tools-image ]
+ runs-on: ubuntu-22.04
+
+ container:
+ image: ${{ needs.build-build-tools-image.outputs.image }}
+ credentials:
+ username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+ password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+ options: --init --user root
+ services:
+ clickhouse:
+ image: clickhouse/clickhouse-server:24.6.3.64
+ ports:
+ - 9000:9000
+ - 8123:8123
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Download Neon artifact
+ uses: ./.github/actions/download
+ with:
+ name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+ path: /tmp/neon/
+ prefix: latest
+
+ - name: Create Neon Project
+ id: create-neon-project
+ uses: ./.github/actions/neon-project-create
+ with:
+ api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+ postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+
+ - name: Run tests
+ uses: ./.github/actions/run-python-test-set
+ with:
+ build_type: remote
+ test_selection: logical_repl
+ run_in_parallel: false
+ extra_params: -m remote_cluster
+ pg_version: ${{ env.DEFAULT_PG_VERSION }}
+ env:
+ BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
+
+ - name: Delete Neon Project
+ if: always()
+ uses: ./.github/actions/neon-project-delete
+ with:
+ project_id: ${{ steps.create-neon-project.outputs.project_id }}
+ api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+ - name: Create Allure report
+ if: ${{ !cancelled() }}
+ id: create-allure-report
+ uses: ./.github/actions/allure-report-generate
+ with:
+ store-test-results-into-db: true
+ env:
+ REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
+ - name: Post to a Slack channel
+ if: github.event.schedule && failure()
+ uses: slackapi/slack-github-action@v1
+ with:
+ channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
+ slack-message: |
+ Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
+ env:
+ SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
  test-postgres-client-libs:
  needs: [ build-build-tools-image ]
  runs-on: ubuntu-22.04
Cargo.lock | 188 (generated)

@@ -1418,7 +1418,7 @@ dependencies = [
  "clap",
  "criterion-plot",
  "is-terminal",
- "itertools",
+ "itertools 0.10.5",
  "num-traits",
  "once_cell",
  "oorandom",
@@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
  dependencies = [
  "cast",
- "itertools",
+ "itertools 0.10.5",
  ]

  [[package]]
@@ -1672,6 +1672,7 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
  dependencies = [
  "bitflags 2.4.1",
  "byteorder",
+ "chrono",
  "diesel_derives",
  "itoa",
  "pq-sys",
@@ -2133,6 +2134,12 @@ dependencies = [
  "slab",
  ]

+ [[package]]
+ name = "gen_ops"
+ version = "0.4.0"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
+
  [[package]]
  name = "generic-array"
  version = "0.14.7"
@@ -2709,17 +2716,6 @@ version = "3.0.4"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"

- [[package]]
- name = "io-lifetimes"
- version = "1.0.11"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
- dependencies = [
- "hermit-abi",
- "libc",
- "windows-sys 0.48.0",
- ]
-
  [[package]]
  name = "io-uring"
  version = "0.6.2"
@@ -2738,14 +2734,13 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"

  [[package]]
  name = "is-terminal"
- version = "0.4.7"
+ version = "0.4.12"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
+ checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
  dependencies = [
  "hermit-abi",
- "io-lifetimes",
- "rustix 0.37.25",
- "windows-sys 0.48.0",
+ "libc",
+ "windows-sys 0.52.0",
  ]

  [[package]]
@@ -2757,6 +2752,15 @@ dependencies = [
  "either",
  ]

+ [[package]]
+ name = "itertools"
+ version = "0.12.1"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+ dependencies = [
+ "either",
+ ]
+
  [[package]]
  name = "itoa"
  version = "1.0.6"
@@ -2871,18 +2875,6 @@ version = "0.2.8"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"

- [[package]]
- name = "linux-raw-sys"
- version = "0.1.4"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
-
- [[package]]
- name = "linux-raw-sys"
- version = "0.3.8"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
-
  [[package]]
  name = "linux-raw-sys"
  version = "0.4.13"
@@ -3000,7 +2992,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
  dependencies = [
  "libc",
  "measured",
- "procfs 0.16.0",
+ "procfs",
  ]

  [[package]]
@@ -3045,7 +3037,7 @@ dependencies = [
  "measured",
  "measured-process",
  "once_cell",
- "procfs 0.14.2",
+ "procfs",
  "prometheus",
  "rand 0.8.5",
  "rand_distr",
@@ -3574,7 +3566,7 @@ dependencies = [
  "humantime",
  "humantime-serde",
  "hyper 0.14.26",
- "itertools",
+ "itertools 0.10.5",
  "leaky-bucket",
  "md5",
  "metrics",
@@ -3592,8 +3584,9 @@ dependencies = [
  "postgres_connection",
  "postgres_ffi",
  "pq_proto",
- "procfs 0.14.2",
+ "procfs",
  "rand 0.8.5",
+ "range-set-blaze",
  "regex",
  "remote_storage",
  "reqwest 0.12.4",
@@ -3644,7 +3637,7 @@ dependencies = [
  "hex",
  "humantime",
  "humantime-serde",
- "itertools",
+ "itertools 0.10.5",
  "postgres_ffi",
  "rand 0.8.5",
  "serde",
@@ -3702,7 +3695,7 @@ dependencies = [
  "hex-literal",
  "humantime",
  "humantime-serde",
- "itertools",
+ "itertools 0.10.5",
  "metrics",
  "once_cell",
  "pageserver_api",
@@ -4034,7 +4027,7 @@ name = "postgres_connection"
  version = "0.1.0"
  dependencies = [
  "anyhow",
- "itertools",
+ "itertools 0.10.5",
  "once_cell",
  "postgres",
  "tokio-postgres",
@@ -4092,7 +4085,7 @@ version = "0.1.0"
  dependencies = [
  "byteorder",
  "bytes",
- "itertools",
+ "itertools 0.10.5",
  "pin-project-lite",
  "postgres-protocol",
  "rand 0.8.5",
@@ -4138,21 +4131,6 @@ dependencies = [
  "unicode-ident",
  ]

- [[package]]
- name = "procfs"
- version = "0.14.2"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
- dependencies = [
- "bitflags 1.3.2",
- "byteorder",
- "chrono",
- "flate2",
- "hex",
- "lazy_static",
- "rustix 0.36.16",
- ]
-
  [[package]]
  name = "procfs"
  version = "0.16.0"
@@ -4160,10 +4138,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
  dependencies = [
  "bitflags 2.4.1",
+ "chrono",
+ "flate2",
  "hex",
  "lazy_static",
  "procfs-core",
- "rustix 0.38.28",
+ "rustix",
  ]

  [[package]]
@@ -4173,14 +4153,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
  dependencies = [
  "bitflags 2.4.1",
+ "chrono",
  "hex",
  ]

  [[package]]
  name = "prometheus"
- version = "0.13.3"
+ version = "0.13.4"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
+ checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
  dependencies = [
  "cfg-if",
  "fnv",
@@ -4188,7 +4169,7 @@ dependencies = [
  "libc",
  "memchr",
  "parking_lot 0.12.1",
- "procfs 0.14.2",
+ "procfs",
  "thiserror",
  ]

@@ -4210,7 +4191,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
  dependencies = [
  "bytes",
  "heck 0.4.1",
- "itertools",
+ "itertools 0.10.5",
  "lazy_static",
  "log",
  "multimap",
@@ -4231,7 +4212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
  dependencies = [
  "anyhow",
- "itertools",
+ "itertools 0.10.5",
  "proc-macro2",
  "quote",
  "syn 1.0.109",
@@ -4288,7 +4269,7 @@ dependencies = [
  "hyper-util",
  "indexmap 2.0.1",
  "ipnet",
- "itertools",
+ "itertools 0.10.5",
  "lasso",
  "md5",
  "measured",
@@ -4464,6 +4445,18 @@ dependencies = [
  "rand_core 0.5.1",
  ]

+ [[package]]
+ name = "range-set-blaze"
+ version = "0.1.16"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
+ dependencies = [
+ "gen_ops",
+ "itertools 0.12.1",
+ "num-integer",
+ "num-traits",
+ ]
+
  [[package]]
  name = "rayon"
  version = "1.7.0"
@@ -4632,7 +4625,7 @@ dependencies = [
  "humantime",
  "humantime-serde",
  "hyper 0.14.26",
- "itertools",
+ "itertools 0.10.5",
  "metrics",
  "once_cell",
  "pin-project-lite",
@@ -4942,34 +4935,6 @@ dependencies = [
  "nom",
  ]

- [[package]]
- name = "rustix"
- version = "0.36.16"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
- dependencies = [
- "bitflags 1.3.2",
- "errno",
- "io-lifetimes",
- "libc",
- "linux-raw-sys 0.1.4",
- "windows-sys 0.45.0",
- ]
-
- [[package]]
- name = "rustix"
- version = "0.37.25"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
- dependencies = [
- "bitflags 1.3.2",
- "errno",
- "io-lifetimes",
- "libc",
- "linux-raw-sys 0.3.8",
- "windows-sys 0.48.0",
- ]
-
  [[package]]
  name = "rustix"
  version = "0.38.28"
@@ -5718,6 +5683,7 @@ dependencies = [
  "aws-config",
  "bytes",
  "camino",
+ "chrono",
  "clap",
  "control_plane",
  "diesel",
@@ -5728,7 +5694,7 @@ dependencies = [
  "hex",
  "humantime",
  "hyper 0.14.26",
- "itertools",
+ "itertools 0.10.5",
  "lasso",
  "measured",
  "metrics",
@@ -5737,6 +5703,7 @@ dependencies = [
  "pageserver_client",
  "postgres_connection",
  "r2d2",
+ "rand 0.8.5",
  "reqwest 0.12.4",
  "routerify",
  "scopeguard",
@@ -5792,9 +5759,10 @@ dependencies = [
  "either",
  "futures",
  "futures-util",
+ "git-version",
  "hex",
  "humantime",
- "itertools",
+ "itertools 0.10.5",
  "once_cell",
  "pageserver",
  "pageserver_api",
@@ -5971,15 +5939,15 @@ dependencies = [

  [[package]]
  name = "tempfile"
- version = "3.5.0"
+ version = "3.9.0"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
+ checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
  dependencies = [
  "cfg-if",
- "fastrand 1.9.0",
- "redox_syscall 0.3.5",
- "rustix 0.37.25",
- "windows-sys 0.45.0",
+ "fastrand 2.0.0",
+ "redox_syscall 0.4.1",
+ "rustix",
+ "windows-sys 0.52.0",
  ]

  [[package]]
@@ -7176,15 +7144,6 @@ dependencies = [
  "windows_x86_64_msvc 0.42.2",
  ]

- [[package]]
- name = "windows-sys"
- version = "0.45.0"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
- dependencies = [
- "windows-targets 0.42.2",
- ]
-
  [[package]]
  name = "windows-sys"
  version = "0.48.0"
@@ -7203,21 +7162,6 @@ dependencies = [
  "windows-targets 0.52.4",
  ]

- [[package]]
- name = "windows-targets"
- version = "0.42.2"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
- dependencies = [
- "windows_aarch64_gnullvm 0.42.2",
- "windows_aarch64_msvc 0.42.2",
- "windows_i686_gnu 0.42.2",
- "windows_i686_msvc 0.42.2",
- "windows_x86_64_gnu 0.42.2",
- "windows_x86_64_gnullvm 0.42.2",
- "windows_x86_64_msvc 0.42.2",
- ]
-
  [[package]]
  name = "windows-targets"
  version = "0.48.0"
@@ -7447,7 +7391,7 @@ dependencies = [
  "hmac",
  "hyper 0.14.26",
  "indexmap 1.9.3",
- "itertools",
+ "itertools 0.10.5",
  "libc",
  "log",
  "memchr",
@@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
  parquet_derive = "51.0.0"
  pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
  pin-project-lite = "0.2"
- procfs = "0.14"
+ procfs = "0.16"
  prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
  prost = "0.11"
  rand = "0.8"
@@ -4,6 +4,11 @@ version = "0.1.0"
  edition.workspace = true
  license.workspace = true

+ [features]
+ default = []
+ # Enables test specific features.
+ testing = []
+
  [dependencies]
  anyhow.workspace = true
  async-compression.workspace = true
@@ -400,7 +400,15 @@ impl ComputeNode {
  pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
  let mut retry_period_ms = 500.0;
  let mut attempts = 0;
- let max_attempts = 10;
+ const DEFAULT_ATTEMPTS: u16 = 10;
+ #[cfg(feature = "testing")]
+ let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
+ u16::from_str(&v).unwrap()
+ } else {
+ DEFAULT_ATTEMPTS
+ };
+ #[cfg(not(feature = "testing"))]
+ let max_attempts = DEFAULT_ATTEMPTS;
  loop {
  let result = self.try_get_basebackup(compute_state, lsn);
  match result {
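The hunk above keeps the default cap of 10 basebackup attempts, but when the crate's new `testing` feature is enabled it lets tests override the cap through the `NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES` environment variable. A condensed sketch of that selection logic, using the `cfg!` macro for brevity rather than the attribute form used in the diff, could look like this:

```rust
/// Sketch of the attempt-cap selection from the hunk above. In the real diff the
/// environment lookup is compiled out entirely via #[cfg(feature = "testing")];
/// here cfg! keeps the two paths side by side.
fn max_basebackup_attempts() -> u16 {
    const DEFAULT_ATTEMPTS: u16 = 10;
    if cfg!(feature = "testing") {
        if let Ok(v) = std::env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
            return v.parse().expect("retry override must parse as u16");
        }
    }
    DEFAULT_ATTEMPTS
}
```

Gating the override behind a feature keeps production builds free of the environment-variable code path while tests can force a very small retry budget.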
@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {

  fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
  for (var, val) in std::env::vars() {
- if var.starts_with("NEON_PAGESERVER_") {
+ if var.starts_with("NEON_") {
  cmd = cmd.env(var, val);
  }
  }
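This widens the pass-through from variables starting with `NEON_PAGESERVER_` to anything starting with `NEON_`. As a standalone sketch (the helper name is illustrative):

```rust
use std::process::Command;

/// Forward every NEON_-prefixed environment variable to a child process,
/// mirroring the widened prefix check in the hunk above.
fn forward_neon_env(mut cmd: Command) -> Command {
    for (var, val) in std::env::vars() {
        if var.starts_with("NEON_") {
            cmd.env(var, val);
        }
    }
    cmd
}
```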
@@ -514,7 +514,6 @@ impl LocalEnv {
  #[derive(serde::Serialize, serde::Deserialize)]
  // (allow unknown fields, unlike PageServerConf)
  struct PageserverConfigTomlSubset {
- id: NodeId,
  listen_pg_addr: String,
  listen_http_addr: String,
  pg_auth_type: AuthType,
@@ -526,18 +525,30 @@ impl LocalEnv {
  .with_context(|| format!("read {:?}", config_toml_path))?,
  )
  .context("parse pageserver.toml")?;
+ let identity_toml_path = dentry.path().join("identity.toml");
+ #[derive(serde::Serialize, serde::Deserialize)]
+ struct IdentityTomlSubset {
+ id: NodeId,
+ }
+ let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
+ &std::fs::read_to_string(&identity_toml_path)
+ .with_context(|| format!("read {:?}", identity_toml_path))?,
+ )
+ .context("parse identity.toml")?;
  let PageserverConfigTomlSubset {
- id: config_toml_id,
  listen_pg_addr,
  listen_http_addr,
  pg_auth_type,
  http_auth_type,
  } = config_toml;
+ let IdentityTomlSubset {
+ id: identity_toml_id,
+ } = identity_toml;
  let conf = PageServerConf {
  id: {
  anyhow::ensure!(
- config_toml_id == id,
- "id mismatch: config_toml.id={config_toml_id} id={id}",
+ identity_toml_id == id,
+ "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
  );
  id
  },
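The node id moves out of `pageserver.toml` into a separate `identity.toml`, and the loader now cross-checks the id read from that file against the expected one. A self-contained sketch of the new parsing step (with a plain `u64` standing in for `NodeId`, and assuming `toml_edit` with its serde support as used in the hunk) might look like:

```rust
use anyhow::Context;

#[derive(serde::Deserialize)]
struct IdentityTomlSubset {
    id: u64, // stands in for NodeId in this sketch
}

/// Read identity.toml next to pageserver.toml and verify the recorded id.
fn read_node_id(identity_toml_path: &std::path::Path, expected_id: u64) -> anyhow::Result<u64> {
    let raw = std::fs::read_to_string(identity_toml_path)
        .with_context(|| format!("read {identity_toml_path:?}"))?;
    let identity: IdentityTomlSubset =
        toml_edit::de::from_str(&raw).context("parse identity.toml")?;
    anyhow::ensure!(
        identity.id == expected_id,
        "id mismatch: identity.toml:id={} expected id={}",
        identity.id,
        expected_id
    );
    Ok(identity.id)
}
```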
@@ -127,10 +127,13 @@ impl PageServerNode {
  }

  // Apply the user-provided overrides
- overrides.push(
- toml_edit::ser::to_string_pretty(&conf)
- .expect("we deserialized this from toml earlier"),
- );
+ overrides.push({
+ let mut doc =
+ toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
+ // `id` is written out to `identity.toml` instead of `pageserver.toml`
+ doc.remove("id").expect("it's part of the struct");
+ doc.to_string()
+ });

  // Turn `overrides` into a toml document.
  // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
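Because the id now lives in `identity.toml`, the override that gets pushed into `pageserver.toml` is built by serializing the config to a `toml_edit` document and stripping the `id` key first. Extracted into a small helper (name and signature are illustrative), that step is roughly:

```rust
/// Sketch of the override construction from the hunk above: serialize the config,
/// drop `id` (it belongs to identity.toml now), and emit the remaining TOML.
fn config_override_without_id<T: serde::Serialize>(conf: &T) -> String {
    let mut doc = toml_edit::ser::to_document(conf)
        .expect("we deserialized this from toml earlier");
    doc.remove("id").expect("it's part of the struct");
    doc.to_string()
}
```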
@@ -1,5 +1,6 @@
+use std::collections::HashSet;
 use std::str::FromStr;
-use std::time::Instant;
+use std::time::{Duration, Instant};

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix). Implemented by the server

@@ -294,6 +295,42 @@ pub enum PlacementPolicy {
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}

+/// Metadata health record posted from scrubber.
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthRecord {
+    pub tenant_shard_id: TenantShardId,
+    pub healthy: bool,
+    pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthUpdateRequest {
+    pub healthy_tenant_shards: HashSet<TenantShardId>,
+    pub unhealthy_tenant_shards: HashSet<TenantShardId>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthUpdateResponse {}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthListUnhealthyResponse {
+    pub unhealthy_tenant_shards: Vec<TenantShardId>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthListOutdatedRequest {
+    #[serde(with = "humantime_serde")]
+    pub not_scrubbed_for: Duration,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthListOutdatedResponse {
+    pub health_records: Vec<MetadataHealthRecord>,
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
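Note (not part of the diff): a sketch of how a scrubber-side client might build and serialize these request types. The JSON shape follows from the derives above, but the string form used for the tenant shard id and the exact field values are assumptions for illustration.

use std::collections::HashSet;
use std::time::Duration;

use serde::{Deserialize, Serialize};

// Stand-in for pageserver_api's TenantShardId; a plain string keeps the sketch self-contained.
type TenantShardId = String;

#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateRequest {
    pub healthy_tenant_shards: HashSet<TenantShardId>,
    pub unhealthy_tenant_shards: HashSet<TenantShardId>,
}

#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedRequest {
    #[serde(with = "humantime_serde")]
    pub not_scrubbed_for: Duration,
}

fn main() -> serde_json::Result<()> {
    let update = MetadataHealthUpdateRequest {
        healthy_tenant_shards: HashSet::from(["tenant-shard-0".to_string()]),
        unhealthy_tenant_shards: HashSet::new(),
    };
    // e.g. {"healthy_tenant_shards":["tenant-shard-0"],"unhealthy_tenant_shards":[]}
    println!("{}", serde_json::to_string(&update)?);

    let outdated = MetadataHealthListOutdatedRequest {
        not_scrubbed_for: Duration::from_secs(3600),
    };
    // humantime_serde renders the duration in humantime form, e.g. "1h"
    println!("{}", serde_json::to_string(&outdated)?);
    Ok(())
}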
@@ -355,7 +355,8 @@ impl RemoteStorage for AzureBlobStorage {
            .blobs()
            .map(|k| ListingObject{
                key: self.name_to_relative_path(&k.name),
-                last_modified: k.properties.last_modified.into()
+                last_modified: k.properties.last_modified.into(),
+                size: k.properties.content_length,
            }
        );

@@ -144,6 +144,7 @@ impl RemotePath {
 ///
 /// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
 /// NoDelimiter mode will only populate `keys`.
+#[derive(Copy, Clone)]
 pub enum ListingMode {
     WithDelimiter,
     NoDelimiter,

@@ -153,6 +154,7 @@ pub enum ListingMode {
 pub struct ListingObject {
     pub key: RemotePath,
     pub last_modified: SystemTime,
+    pub size: u64,
 }

 #[derive(Default)]
@@ -194,7 +196,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>>;
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;

    async fn list(
        &self,

@@ -351,10 +353,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &'a CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
        match self {
            Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
-                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
+                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
            Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
@@ -368,6 +368,7 @@ impl RemoteStorage for LocalFs {
                key: k.clone(),
                // LocalFs is just for testing, so just specify a dummy time
                last_modified: SystemTime::now(),
+                size: 0,
            })
        }
    })

@@ -411,6 +412,7 @@ impl RemoteStorage for LocalFs {
                key: RemotePath::from_string(&relative_key).unwrap(),
                // LocalFs is just for testing
                last_modified: SystemTime::now(),
+                size: 0,
            });
        }
    }
@@ -565,9 +565,12 @@ impl RemoteStorage for S3Bucket {
            }
        };

+        let size = object.size.unwrap_or(0) as u64;
+
        result.keys.push(ListingObject{
            key,
-            last_modified
+            last_modified,
+            size,
        });
        if let Some(mut mk) = max_keys {
            assert!(mk > 0);
@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
        async_stream::stream! {
            self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
                .map_err(DownloadError::Other)?;
@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
 #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
-    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
+    /// Provides access to all data for a specific tenant (specified in `struct Claims` below)
     // TODO: join these two?
     Tenant,
-    // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
-    // Should only be used e.g. for status check/tenant creation/list.
+    /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
+    /// Should only be used e.g. for status check/tenant creation/list.
     PageServerApi,
-    // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
-    // Should only be used e.g. for status check.
-    // Currently also used for connection from any pageserver to any safekeeper.
+    /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
+    /// Should only be used e.g. for status check.
+    /// Currently also used for connection from any pageserver to any safekeeper.
     SafekeeperData,
-    // The scope used by pageservers in upcalls to storage controller and cloud control plane
+    /// The scope used by pageservers in upcalls to storage controller and cloud control plane
     #[serde(rename = "generations_api")]
     GenerationsApi,
-    // Allows access to control plane managment API and some storage controller endpoints.
+    /// Allows access to control plane managment API and some storage controller endpoints.
     Admin,

     /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
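Note (not part of the diff): the serde attributes above determine the wire form of each scope: with rename_all = "lowercase" the variants serialize as e.g. "tenant" and "pageserverapi", while the explicit rename maps GenerationsApi to "generations_api". A minimal round-trip sketch on a reduced copy of the enum (the surrounding Claims layout is not shown in the diff and is not assumed here):

use serde::{Deserialize, Serialize};

// Reduced copy of the Scope enum, only to illustrate the serde renames.
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum Scope {
    Tenant,
    PageServerApi,
    SafekeeperData,
    #[serde(rename = "generations_api")]
    GenerationsApi,
    Admin,
}

fn main() -> serde_json::Result<()> {
    // Variant names are lowercased on the wire ...
    assert_eq!(serde_json::to_string(&Scope::PageServerApi)?, "\"pageserverapi\"");
    // ... except where an explicit rename overrides that.
    assert_eq!(serde_json::to_string(&Scope::GenerationsApi)?, "\"generations_api\"");
    let parsed: Scope = serde_json::from_str("\"tenant\"")?;
    assert_eq!(parsed, Scope::Tenant);
    Ok(())
}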
@@ -49,6 +49,7 @@ postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 rand.workspace = true
+range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
@@ -1,3 +1,4 @@
+use criterion::measurement::WallTime;
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;

@@ -15,7 +16,11 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
+
+fn fixture_path(relative: &str) -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
+}

 fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
     let mut layer_map = LayerMap::default();
@@ -109,7 +114,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
 // between each test run.
 fn bench_from_captest_env(c: &mut Criterion) {
     // TODO consider compressing this file
-    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
     let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

     // Test with uniform query pattern

@@ -139,7 +144,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
 fn bench_from_real_project(c: &mut Criterion) {
     // Init layer map
     let now = Instant::now();
-    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
     println!("Finished layer map init in {:?}", now.elapsed());

     // Choose uniformly distributed queries
@@ -242,7 +247,72 @@ fn bench_sequential(c: &mut Criterion) {
     group.finish();
 }

+fn bench_visibility_with_map(
+    group: &mut BenchmarkGroup<WallTime>,
+    layer_map: LayerMap,
+    read_points: Vec<Lsn>,
+    bench_name: &str,
+) {
+    group.bench_function(bench_name, |b| {
+        b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
+    });
+}
+
+// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
+fn bench_visibility(c: &mut Criterion) {
+    let mut group = c.benchmark_group("visibility");
+    {
+        // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
+        let now = Instant::now();
+        let mut layer_map = LayerMap::default();
+        let mut updates = layer_map.batch_update();
+        for i in 0..100_000 {
+            let i32 = (i as u32) % 100;
+            let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
+            let layer = PersistentLayerDesc::new_img(
+                TenantShardId::unsharded(TenantId::generate()),
+                TimelineId::generate(),
+                zero.add(10 * i32)..zero.add(10 * i32 + 1),
+                Lsn(i),
+                0,
+            );
+            updates.insert_historic(layer);
+        }
+        updates.flush();
+        println!("Finished layer map init in {:?}", now.elapsed());
+
+        let mut read_points = Vec::new();
+        for i in (0..100_000).step_by(1000) {
+            read_points.push(Lsn(i));
+        }
+
+        bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
+    }
+
+    {
+        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+        let read_points = vec![Lsn(0x1C760FA190)];
+        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
+
+        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+        let read_points = vec![
+            Lsn(0x1C760FA190),
+            Lsn(0x000000931BEAD539),
+            Lsn(0x000000931BF63011),
+            Lsn(0x000000931B33AE68),
+            Lsn(0x00000038E67ABFA0),
+            Lsn(0x000000931B33AE68),
+            Lsn(0x000000914E3F38F0),
+            Lsn(0x000000931B33AE68),
+        ];
+        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
+    }
+
+    group.finish();
+}
+
 criterion_group!(group_1, bench_from_captest_env);
 criterion_group!(group_2, bench_from_real_project);
 criterion_group!(group_3, bench_sequential);
-criterion_main!(group_1, group_2, group_3);
+criterion_group!(group_4, bench_visibility);
+criterion_main!(group_1, group_2, group_3, group_4);
@@ -17,11 +17,9 @@ use pageserver::config::PageserverIdentity;
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
-use pageserver::task_mgr::WALRECEIVER_RUNTIME;
+use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
 use pageserver::tenant::{secondary, TenantSharedResources};
-use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
-};
+use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;

@@ -31,11 +29,9 @@ use tracing::*;
 use metrics::set_build_info_metric;
 use pageserver::{
     config::PageServerConf,
-    context::{DownloadBehavior, RequestContext},
     deletion_queue::DeletionQueue,
     http, page_cache, page_service, task_mgr,
-    task_mgr::TaskKind,
-    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
+    task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
     tenant::mgr,
     virtual_file,
 };
@@ -129,6 +125,7 @@ fn main() -> anyhow::Result<()> {
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.get_impl, "starting with get page implementation");
    info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
+    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
@@ -593,30 +590,13 @@ fn start_pageserver(

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    let libpq_listener = {
-        let cancel = CancellationToken::new();
-        let libpq_ctx = RequestContext::todo_child(
-            TaskKind::LibpqEndpointListener,
-            // listener task shouldn't need to download anything. (We will
-            // create a separate sub-contexts for each connection, with their
-            // own download behavior. This context is used only to listen and
-            // accept connections.)
-            DownloadBehavior::Error,
-        );
-
-        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-            "libpq listener",
-            page_service::libpq_listener_main(
-                tenant_manager.clone(),
-                pg_auth,
-                pageserver_listener,
-                conf.pg_auth_type,
-                libpq_ctx,
-                cancel.clone(),
-            ),
-        ));
-        LibpqEndpointListener(CancellableTask { task, cancel })
-    };
+    let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
+        let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
+        pageserver_listener
+            .set_nonblocking(true)
+            .context("set listener to nonblocking")?;
+        tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
+    });

    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

@@ -644,7 +624,7 @@ fn start_pageserver(
    shutdown_pageserver.take();
    pageserver::shutdown_pageserver(
        http_endpoint_listener,
-        libpq_listener,
+        page_service,
        consumption_metrics_tasks,
        disk_usage_eviction_task,
        &tenant_manager,
@@ -29,6 +29,7 @@ use utils::{
    logging::LogFormat,
 };

+use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};

@@ -295,6 +296,10 @@ pub struct PageServerConf {
    pub ephemeral_bytes_per_memory_kb: usize,

    pub l0_flush: L0FlushConfig,
+
+    /// This flag is temporary and will be removed after gradual rollout.
+    /// See <https://github.com/neondatabase/neon/issues/8184>.
+    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -356,8 +361,6 @@ struct PageServerConfigBuilder {
    auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,

-    id: BuilderValue<NodeId>,
-
    broker_endpoint: BuilderValue<Uri>,
    broker_keepalive_interval: BuilderValue<Duration>,

@@ -403,14 +406,13 @@ struct PageServerConfigBuilder {
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,

    l0_flush: BuilderValue<L0FlushConfig>,
+
+    compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
 }

 impl PageServerConfigBuilder {
-    fn new(node_id: NodeId) -> Self {
-        let mut this = Self::default();
-        this.id(node_id);
-
-        this
+    fn new() -> Self {
+        Self::default()
    }

    #[inline(always)]
@@ -438,7 +440,6 @@ impl PageServerConfigBuilder {
            pg_auth_type: Set(AuthType::Trust),
            auth_validation_public_key_path: Set(None),
            remote_storage_config: Set(None),
-            id: NotSet,
            broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                .parse()
                .expect("failed to parse default broker endpoint")),

@@ -496,6 +497,7 @@ impl PageServerConfigBuilder {
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: Set(L0FlushConfig::default()),
+            compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
        }
    }
 }
@@ -568,10 +570,6 @@ impl PageServerConfigBuilder {
        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
    }

-    pub fn id(&mut self, node_id: NodeId) {
-        self.id = BuilderValue::Set(node_id)
-    }
-
    pub fn log_format(&mut self, log_format: LogFormat) {
        self.log_format = BuilderValue::Set(log_format)
    }

@@ -683,7 +681,11 @@ impl PageServerConfigBuilder {
        self.l0_flush = BuilderValue::Set(value);
    }

-    pub fn build(self) -> anyhow::Result<PageServerConf> {
+    pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
+        self.compact_level0_phase1_value_access = BuilderValue::Set(value);
+    }
+
+    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

        macro_rules! conf {
@@ -716,7 +718,6 @@ impl PageServerConfigBuilder {
            pg_auth_type,
            auth_validation_public_key_path,
            remote_storage_config,
-            id,
            broker_endpoint,
            broker_keepalive_interval,
            log_format,

@@ -741,9 +742,11 @@ impl PageServerConfigBuilder {
            image_compression,
            ephemeral_bytes_per_memory_kb,
            l0_flush,
+            compact_level0_phase1_value_access,
        }
        CUSTOM LOGIC
        {
+            id: id,
            // TenantConf is handled separately
            default_tenant_conf: TenantConf::default(),
            concurrent_tenant_warmup: ConfigurableSemaphore::new({
@@ -893,7 +896,7 @@ impl PageServerConf {
        toml: &Document,
        workdir: &Utf8Path,
    ) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::new(node_id);
+        let mut builder = PageServerConfigBuilder::new();
        builder.workdir(workdir.to_owned());

        let mut t_conf = TenantConfOpt::default();

@@ -924,8 +927,6 @@ impl PageServerConf {
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
-                "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
-                // Logging is not set up yet, so we can't do it.
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                "log_format" => builder.log_format(
@@ -1014,11 +1015,14 @@ impl PageServerConf {
                "l0_flush" => {
                    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
                }
+                "compact_level0_phase1_value_access" => {
+                    builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }

-        let mut conf = builder.build().context("invalid config")?;
+        let mut conf = builder.build(node_id).context("invalid config")?;

        if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
            let auth_validation_public_key_path = conf

@@ -1098,6 +1102,7 @@ impl PageServerConf {
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
            l0_flush: L0FlushConfig::default(),
+            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
        }
    }
 }
@@ -1255,7 +1260,6 @@ max_file_descriptors = 333

 # initial superuser role name to use when creating a new tenant
 initial_superuser_name = 'zzzz'
-id = 10

 metric_collection_interval = '222 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'

@@ -1272,9 +1276,8 @@ background_task_maximum_delay = '334 s'
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
        // we have to create dummy values to overcome the validation errors
-        let config_string = format!(
-            "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
-        );
+        let config_string =
+            format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
        let toml = config_string.parse()?;

        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
@@ -1341,6 +1344,7 @@ background_task_maximum_delay = '334 s'
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
+                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );

@@ -1415,6 +1419,7 @@ background_task_maximum_delay = '334 s'
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
+                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1579,7 +1584,6 @@ broker_endpoint = '{broker_endpoint}'
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
-id = 222

 [disk_usage_based_eviction]
 max_usage_pct = 80

@@ -1649,7 +1653,6 @@ threshold = "20m"
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
-id = 222

 [tenant_config]
 evictions_low_residence_duration_metric_threshold = "20m"
@@ -296,6 +296,11 @@ impl From<GetActiveTenantError> for ApiError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => {
                ApiError::ResourceUnavailable(format!("{}", e).into())
            }
+            GetActiveTenantError::SwitchedTenant => {
+                // in our HTTP handlers, this error doesn't happen
+                // TODO: separate error types
+                ApiError::ResourceUnavailable("switched tenant".into())
+            }
        }
    }
 }
@@ -2129,14 +2134,24 @@ async fn secondary_download_handler(
    let timeout = wait.unwrap_or(Duration::MAX);

-    let status = match tokio::time::timeout(
+    let result = tokio::time::timeout(
        timeout,
        state.secondary_controller.download_tenant(tenant_shard_id),
    )
-    .await
-    {
-        // Download job ran to completion.
-        Ok(Ok(())) => StatusCode::OK,
+    .await;
+
+    let progress = secondary_tenant.progress.lock().unwrap().clone();
+
+    let status = match result {
+        Ok(Ok(())) => {
+            if progress.layers_downloaded >= progress.layers_total {
+                // Download job ran to completion
+                StatusCode::OK
+            } else {
+                // Download dropped out without errors because it ran out of time budget
+                StatusCode::ACCEPTED
+            }
+        }
        // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
        // okay. We could get an error here in the unlikely edge case that the tenant
        // was detached between our check above and executing the download job.

@@ -2146,8 +2161,6 @@ async fn secondary_download_handler(
        Err(_) => StatusCode::ACCEPTED,
    };

-    let progress = secondary_tenant.progress.lock().unwrap().clone();
-
    json_response(status, progress)
 }

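Note (not part of the diff): a condensed sketch of the status selection the rewritten handler performs. The progress snapshot is taken once before matching on the (possibly timed-out) download result; names, the Option-based signature, and the placeholder handling of the error arm are local to the sketch and not part of the real handler.

use http::StatusCode;

// Minimal stand-in for the progress snapshot taken from the secondary tenant.
struct Progress {
    layers_downloaded: u64,
    layers_total: u64,
}

// Mirrors the branch structure above: a completed download only reports 200 OK when
// every known layer was fetched; running out of the time budget reports 202 Accepted.
fn download_status(outcome: Option<bool>, progress: &Progress) -> StatusCode {
    match outcome {
        Some(true) if progress.layers_downloaded >= progress.layers_total => StatusCode::OK,
        // Finished without error but with layers still missing: out of time budget.
        Some(true) => StatusCode::ACCEPTED,
        // Placeholder: the real error arm lies outside the shown hunk.
        Some(false) => StatusCode::ACCEPTED,
        // The outer wait deadline elapsed before the download job finished.
        None => StatusCode::ACCEPTED,
    }
}

fn main() {
    let progress = Progress { layers_downloaded: 10, layers_total: 10 };
    assert_eq!(download_status(Some(true), &progress), StatusCode::OK);
    assert_eq!(download_status(None, &progress), StatusCode::ACCEPTED);
}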
@@ -2,13 +2,23 @@ use std::{num::NonZeroUsize, sync::Arc};

 use crate::tenant::ephemeral_file;

-#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
-    #[default]
     PageCached,
     #[serde(rename_all = "snake_case")]
-    Direct { max_concurrency: NonZeroUsize },
+    Direct {
+        max_concurrency: NonZeroUsize,
+    },
+}
+
+impl Default for L0FlushConfig {
+    fn default() -> Self {
+        Self::Direct {
+            // TODO: using num_cpus results in different peak memory usage on different instance types.
+            max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
+        }
+    }
 }

 #[derive(Clone)]
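Note (not part of the diff): because the enum is internally tagged with "mode" and renamed to kebab-case (variant fields to snake_case), the accepted config shape follows directly from the serde attributes. A minimal sketch of that mapping on a reduced copy of the type; using serde_json here is only to keep the demo self-contained, the actual pageserver.toml syntax for this setting is not shown in the diff and is not assumed.

use std::num::NonZeroUsize;

use serde::Deserialize;

// Reduced copy of L0FlushConfig, only to illustrate how the serde attributes shape the config.
#[derive(Debug, PartialEq, Eq, Clone, Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
    PageCached,
    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
}

fn main() {
    // mode = "direct" selects the Direct variant and requires max_concurrency.
    let direct: L0FlushConfig =
        serde_json::from_str(r#"{ "mode": "direct", "max_concurrency": 4 }"#).unwrap();
    assert_eq!(
        direct,
        L0FlushConfig::Direct { max_concurrency: NonZeroUsize::new(4).unwrap() }
    );

    // mode = "page-cached" selects the (now non-default) PageCached variant.
    let cached: L0FlushConfig = serde_json::from_str(r#"{ "mode": "page-cached" }"#).unwrap();
    assert_eq!(cached, L0FlushConfig::PageCached);
}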
@@ -12,6 +12,8 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
+
+use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
 pub mod aux_file;

@@ -30,14 +32,13 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

-use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
 use tenant::{
     mgr::{BackgroundPurges, TenantManager},
     secondary,
 };
-use tracing::info;
+use tracing::{info, info_span};

 /// Current storage format version
 ///
@@ -63,7 +64,6 @@ pub struct CancellableTask {
    pub cancel: CancellationToken,
 }
 pub struct HttpEndpointListener(pub CancellableTask);
-pub struct LibpqEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
 impl CancellableTask {

@@ -77,7 +77,7 @@ impl CancellableTask {
 #[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
    http_listener: HttpEndpointListener,
-    libpq_listener: LibpqEndpointListener,
+    page_service: page_service::Listener,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
@@ -87,10 +87,83 @@ pub async fn shutdown_pageserver(
    exit_code: i32,
 ) {
    use std::time::Duration;
+
+    // If the orderly shutdown below takes too long, we still want to make
+    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
+    //
+    // (Leftover walredo processes are the hypothesized trigger for the systemd freezes
+    //  that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
+    //
+    // We use a thread instead of a tokio task because the background runtime is likely busy
+    // with the final flushing / uploads. This activity here has priority, and due to lack
+    // of scheduling priority feature sin the tokio scheduler, using a separate thread is
+    // an effective priority booster.
+    let walredo_extraordinary_shutdown_thread_span = {
+        let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
+        span.follows_from(tracing::Span::current());
+        span
+    };
+    let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
+    let walredo_extraordinary_shutdown_thread = std::thread::spawn({
+        let walredo_extraordinary_shutdown_thread_cancel =
+            walredo_extraordinary_shutdown_thread_cancel.clone();
+        move || {
+            let rt = tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+                .unwrap();
+            let _entered = rt.enter();
+            let _entered = walredo_extraordinary_shutdown_thread_span.enter();
+            if let Ok(()) = rt.block_on(tokio::time::timeout(
+                Duration::from_secs(8),
+                walredo_extraordinary_shutdown_thread_cancel.cancelled(),
+            )) {
+                info!("cancellation requested");
+                return;
+            }
+            let managers = tenant::WALREDO_MANAGERS
+                .lock()
+                .unwrap()
+                // prevents new walredo managers from being inserted
+                .take()
+                .expect("only we take()");
+            // Use FuturesUnordered to get in queue early for each manager's
+            // heavier_once_cell semaphore wait list.
+            // Also, for idle tenants that for some reason haven't
+            // shut down yet, it's quite likely that we're not going
+            // to get Poll::Pending once.
+            let mut futs: FuturesUnordered<_> = managers
+                .into_iter()
+                .filter_map(|(_, mgr)| mgr.upgrade())
+                .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
+                .collect();
+            info!(count=%futs.len(), "built FuturesUnordered");
+            let mut last_log_at = std::time::Instant::now();
+            #[derive(Debug, Default)]
+            struct Results {
+                initiated: u64,
+                already: u64,
+            }
+            let mut results = Results::default();
+            while let Some(we_initiated) = rt.block_on(futs.next()) {
+                if we_initiated {
+                    results.initiated += 1;
+                } else {
+                    results.already += 1;
+                }
+                if last_log_at.elapsed() > Duration::from_millis(100) {
+                    info!(remaining=%futs.len(), ?results, "progress");
+                    last_log_at = std::time::Instant::now();
+                }
+            }
+            info!(?results, "done");
+        }
+    });

    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    timed(
-        libpq_listener.0.shutdown(),
+    let remaining_connections = timed(
+        page_service.stop_accepting(),
        "shutdown LibpqEndpointListener",
        Duration::from_secs(1),
    )
@@ -108,7 +181,7 @@ pub async fn shutdown_pageserver(
    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
    // should already have been canclled via mgr::shutdown_all_tenants
    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
+        remaining_connections.shutdown(),
        "shutdown PageRequestHandlers",
        Duration::from_secs(1),
    )

@@ -162,6 +235,12 @@ pub async fn shutdown_pageserver(
        Duration::from_secs(1),
    )
    .await;
+
+    info!("cancel & join walredo_extraordinary_shutdown_thread");
+    walredo_extraordinary_shutdown_thread_cancel.cancel();
+    walredo_extraordinary_shutdown_thread.join().unwrap();
+    info!("walredo_extraordinary_shutdown_thread done");
+
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
@@ -525,6 +525,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_visible_physical_size",
+        "The size of the layer files present in the pageserver's filesystem.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
        "pageserver_resident_physical_size_global",
@@ -613,7 +622,23 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_compression_image_in_bytes_total",
-        "Size of uncompressed data written into image layers"
+        "Size of data written into image layers before compression"
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_compression_image_in_bytes_considered",
+        "Size of potentially compressible data written into image layers before compression"
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_compression_image_in_bytes_chosen",
+        "Size of data whose compressed form was written into image layers"
    )
    .expect("failed to define a metric")
 });
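Note (not part of the diff): read together, the three counters suggest a nesting of chosen <= considered <= total input bytes; that nesting is an inference from the metric descriptions above, not stated in the diff. A small sketch of how the counters could be combined into ratios when inspecting scraped values:

// Derives rough compression ratios from the three counter readings; names are local to the sketch.
fn compression_ratios(total: u64, considered: u64, chosen: u64) -> (f64, f64) {
    // Share of image-layer input bytes that were eligible for compression at all.
    let eligible = considered as f64 / total.max(1) as f64;
    // Share of eligible bytes for which the compressed form actually won and was written.
    let accepted = chosen as f64 / considered.max(1) as f64;
    (eligible, accepted)
}

fn main() {
    let (eligible, accepted) = compression_ratios(1_000_000, 800_000, 600_000);
    println!("eligible={eligible:.2} accepted={accepted:.2}");
}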
@@ -2188,6 +2213,7 @@ pub(crate) struct TimelineMetrics {
    pub(crate) layer_count_delta: UIntGauge,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
+    pub visible_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub aux_file_size_gauge: IntGauge,

@@ -2310,6 +2336,9 @@ impl TimelineMetrics {
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
+        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
        // TODO: we shouldn't expose this metric
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2364,6 +2393,7 @@ impl TimelineMetrics {
            layer_count_delta,
            standby_horizon_gauge,
            resident_physical_size_gauge,
+            visible_physical_size_gauge,
            current_logical_size_gauge,
            aux_file_size_gauge,
            directory_entries_count_gauge,

@@ -2415,6 +2445,7 @@ impl TimelineMetrics {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
+        let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
[File diff suppressed because it is too large]
@@ -8,8 +8,7 @@ use std::time::Duration;
 pub use pageserver_api::key::{Key, KEY_SIZE};

 /// A 'value' stored for a one Key.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[cfg_attr(test, derive(PartialEq))]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub enum Value {
     /// An Image value contains a full copy of the value
     Image(Bytes),

@@ -33,6 +33,7 @@ use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
 use std::collections::BTreeMap;
 use std::fmt;
+use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
@@ -312,14 +313,66 @@ impl std::fmt::Debug for Tenant {
 }

 pub(crate) enum WalRedoManager {
-    Prod(PostgresRedoManager),
+    Prod(WalredoManagerId, PostgresRedoManager),
     #[cfg(test)]
     Test(harness::TestRedoManager),
 }

-impl From<PostgresRedoManager> for WalRedoManager {
-    fn from(mgr: PostgresRedoManager) -> Self {
-        Self::Prod(mgr)
+#[derive(thiserror::Error, Debug)]
+#[error("pageserver is shutting down")]
+pub(crate) struct GlobalShutDown;
+
+impl WalRedoManager {
+    pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
+        let id = WalredoManagerId::next();
+        let arc = Arc::new(Self::Prod(id, mgr));
+        let mut guard = WALREDO_MANAGERS.lock().unwrap();
+        match &mut *guard {
+            Some(map) => {
+                map.insert(id, Arc::downgrade(&arc));
+                Ok(arc)
+            }
+            None => Err(GlobalShutDown),
+        }
+    }
+}
+
+impl Drop for WalRedoManager {
+    fn drop(&mut self) {
+        match self {
+            Self::Prod(id, _) => {
+                let mut guard = WALREDO_MANAGERS.lock().unwrap();
+                if let Some(map) = &mut *guard {
+                    map.remove(id).expect("new() registers, drop() unregisters");
+                }
+            }
+            #[cfg(test)]
+            Self::Test(_) => {
+                // Not applicable to test redo manager
+            }
+        }
+    }
+}
+
+/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
+/// the walredo processes outside of the regular order.
+///
+/// This is necessary to work around a systemd bug where it freezes if there are
+/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
+#[allow(clippy::type_complexity)]
+pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
+    Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
+> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
+#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
+pub(crate) struct WalredoManagerId(u64);
+impl WalredoManagerId {
+    pub fn next() -> Self {
+        static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
+        let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        if id == 0 {
+            panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
+        }
+        Self(id)
     }
 }
@@ -331,19 +384,20 @@ impl From<harness::TestRedoManager> for WalRedoManager {
 }

 impl WalRedoManager {
-    pub(crate) async fn shutdown(&self) {
+    pub(crate) async fn shutdown(&self) -> bool {
        match self {
-            Self::Prod(mgr) => mgr.shutdown().await,
+            Self::Prod(_, mgr) => mgr.shutdown().await,
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
+                true
            }
        }
    }

    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
        match self {
-            Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
+            Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
|
|||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
) -> Result<bytes::Bytes, walredo::Error> {
|
) -> Result<bytes::Bytes, walredo::Error> {
|
||||||
match self {
|
match self {
|
||||||
Self::Prod(mgr) => {
|
Self::Prod(_, mgr) => {
|
||||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
@@ -377,7 +431,7 @@ impl WalRedoManager {
|
|||||||
|
|
||||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||||
match self {
|
match self {
|
||||||
WalRedoManager::Prod(m) => Some(m.status()),
|
WalRedoManager::Prod(_, m) => Some(m.status()),
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
WalRedoManager::Test(_) => None,
|
WalRedoManager::Test(_) => None,
|
||||||
}
|
}
|
||||||
@@ -386,6 +440,8 @@ impl WalRedoManager
 
 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 pub enum GetTimelineError {
+    #[error("Timeline is shutting down")]
+    ShuttingDown,
     #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
     NotActive {
         tenant_id: TenantShardId,
@@ -675,11 +731,9 @@ impl Tenant
         init_order: Option<InitializationOrder>,
         mode: SpawnMode,
         ctx: &RequestContext,
-    ) -> Arc<Tenant> {
-        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
-            conf,
-            tenant_shard_id,
-        )));
+    ) -> Result<Arc<Tenant>, GlobalShutDown> {
+        let wal_redo_manager =
+            WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
 
         let TenantSharedResources {
             broker_client,
@@ -878,7 +932,7 @@ impl Tenant
             }
             .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
         );
-        tenant
+        Ok(tenant)
     }
 
     #[instrument(skip_all)]
@@ -1580,7 +1634,7 @@ impl Tenant
         self: Arc<Self>,
         timeline_id: TimelineId,
     ) -> Result<(), DeleteTimelineError> {
-        DeleteTimelineFlow::run(&self, timeline_id, false).await?;
+        DeleteTimelineFlow::run(&self, timeline_id).await?;
 
         Ok(())
     }
@@ -6909,7 +6963,11 @@ mod tests
             vec![
                 // Image layer at GC horizon
                 PersistentLayerKey {
-                    key_range: Key::MIN..Key::MAX,
+                    key_range: {
+                        let mut key = Key::MAX;
+                        key.field6 -= 1;
+                        Key::MIN..key
+                    },
                     lsn_range: Lsn(0x30)..Lsn(0x31),
                     is_delta: false
                 },
@@ -6928,6 +6986,15 @@ mod tests
             ]
         );
 
+        // increase GC horizon and compact again
+        {
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            guard.cutoffs.time = Lsn(0x40);
+            guard.cutoffs.space = Lsn(0x40);
+        }
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+
         Ok(())
     }
 
@@ -7279,6 +7346,15 @@ mod tests
             );
         }
 
+        // increase GC horizon and compact again
+        {
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            guard.cutoffs.time = Lsn(0x40);
+            guard.cutoffs.space = Lsn(0x40);
+        }
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+
         Ok(())
     }
 
@@ -7347,6 +7423,7 @@ mod tests
                 Lsn(0x60),
                 &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
                 3,
+                None,
             )
             .await
             .unwrap();
@@ -7471,7 +7548,7 @@ mod tests
             ),
         ];
         let res = tline
-            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3)
+            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
             .await
             .unwrap();
         let expected_res = KeyHistoryRetention {
@@ -7517,6 +7594,114 @@ mod tests
         };
         assert_eq!(res, expected_res);
 
+        // In case of branch compaction, the branch itself does not have the full history, and we need to provide
+        // the ancestor image in the test case.
+
+        let history = vec![
+            (
+                key,
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
+            ),
+            (
+                key,
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
+            ),
+            (
+                key,
+                Lsn(0x40),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
+            ),
+            (
+                key,
+                Lsn(0x70),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
+            ),
+        ];
+        let res = tline
+            .generate_key_retention(
+                key,
+                &history,
+                Lsn(0x60),
+                &[],
+                3,
+                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+            )
+            .await
+            .unwrap();
+        let expected_res = KeyHistoryRetention {
+            below_horizon: vec![(
+                Lsn(0x60),
+                KeyLogAtLsn(vec![(
+                    Lsn(0x60),
+                    Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
+                )]),
+            )],
+            above_horizon: KeyLogAtLsn(vec![(
+                Lsn(0x70),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
+            )]),
+        };
+        assert_eq!(res, expected_res);
+
+        let history = vec![
+            (
+                key,
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
+            ),
+            (
+                key,
+                Lsn(0x40),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
+            ),
+            (
+                key,
+                Lsn(0x60),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
+            ),
+            (
+                key,
+                Lsn(0x70),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
+            ),
+        ];
+        let res = tline
+            .generate_key_retention(
+                key,
+                &history,
+                Lsn(0x60),
+                &[Lsn(0x30)],
+                3,
+                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+            )
+            .await
+            .unwrap();
+        let expected_res = KeyHistoryRetention {
+            below_horizon: vec![
+                (
+                    Lsn(0x30),
+                    KeyLogAtLsn(vec![(
+                        Lsn(0x20),
+                        Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
+                    )]),
+                ),
+                (
+                    Lsn(0x60),
+                    KeyLogAtLsn(vec![(
+                        Lsn(0x60),
+                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
+                    )]),
+                ),
+            ],
+            above_horizon: KeyLogAtLsn(vec![(
+                Lsn(0x70),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
+            )]),
+        };
+        assert_eq!(res, expected_res);
+
         Ok(())
     }
 
@@ -7674,6 +7859,10 @@ mod tests
         ];
 
         let verify_result = || async {
+            let gc_horizon = {
+                let gc_info = tline.gc_info.read().unwrap();
+                gc_info.cutoffs.time
+            };
             for idx in 0..10 {
                 assert_eq!(
                     tline
@@ -7684,7 +7873,7 @@ mod tests
                 );
                 assert_eq!(
                     tline
-                        .get(get_key(idx as u32), Lsn(0x30), &ctx)
+                        .get(get_key(idx as u32), gc_horizon, &ctx)
                        .await
                        .unwrap(),
                    &expected_result_at_gc_horizon[idx]
@@ -7710,6 +7899,205 @@ mod tests
 
         let cancel = CancellationToken::new();
         tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        verify_result().await;
+
+        // compact again
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        verify_result().await;
+
+        // increase GC horizon and compact again
+        {
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            guard.cutoffs.time = Lsn(0x38);
+            guard.cutoffs.space = Lsn(0x38);
+        }
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
+
+        // not increasing the GC horizon and compact again
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        verify_result().await;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+            (
+                get_key(2),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
+            ),
+            (
+                get_key(3),
+                Lsn(0x28),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
+            ),
+            (
+                get_key(3),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
+            ),
+            (
+                get_key(3),
+                Lsn(0x40),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
+            ),
+        ];
+        let delta2 = vec![
+            (
+                get_key(5),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+            (
+                get_key(6),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+        ];
+        let delta3 = vec![
+            (
+                get_key(8),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+            (
+                get_key(9),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+        ];
+
+        let parent_tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![], // delta layers
+                vec![(Lsn(0x18), img_layer)], // image layers
+                Lsn(0x18),
+            )
+            .await?;
+
+        parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
+
+        let branch_tline = tenant
+            .branch_timeline_test_with_layers(
+                &parent_tline,
+                NEW_TIMELINE_ID,
+                Some(Lsn(0x18)),
+                &ctx,
+                vec![
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
+                ], // delta layers
+                vec![], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+
+        branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
+
+        {
+            // Update GC info
+            let mut guard = parent_tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x10),
+                    space: Lsn(0x10),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        {
+            // Update GC info
+            let mut guard = branch_tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x50),
+                    space: Lsn(0x50),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let expected_result_at_gc_horizon = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10@0x30"),
+            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10@0x20"),
+            Bytes::from_static(b"value 6@0x10@0x20"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10@0x48"),
+            Bytes::from_static(b"value 9@0x10@0x48"),
+        ];
+
+        let expected_result_at_lsn_40 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10@0x30"),
+            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10@0x20"),
+            Bytes::from_static(b"value 6@0x10@0x20"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let verify_result = || async {
+            for idx in 0..10 {
+                assert_eq!(
+                    branch_tline
+                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_gc_horizon[idx]
+                );
+                assert_eq!(
+                    branch_tline
+                        .get(get_key(idx as u32), Lsn(0x40), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_40[idx]
+                );
+            }
+        };
+
+        verify_result().await;
+
+        let cancel = CancellationToken::new();
+        branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+
         verify_result().await;
 
@@ -28,6 +28,12 @@ use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};
 
+#[derive(Copy, Clone, Debug)]
+pub struct CompressionInfo {
+    pub written_compressed: bool,
+    pub compressed_size: Option<usize>,
+}
+
 impl<'a> BlockCursor<'a> {
     /// Read a blob into a new buffer.
     pub async fn read_blob(
@@ -273,8 +279,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
         srcbuf: B,
         ctx: &RequestContext,
     ) -> (B::Buf, Result<u64, Error>) {
-        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
-            .await
+        let (buf, res) = self
+            .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
+            .await;
+        (buf, res.map(|(off, _compression_info)| off))
     }
 
     /// Write a blob of data. Returns the offset that it was written to,
@@ -284,8 +292,12 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
         srcbuf: B,
         ctx: &RequestContext,
         algorithm: ImageCompressionAlgorithm,
-    ) -> (B::Buf, Result<u64, Error>) {
+    ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
         let offset = self.offset;
+        let mut compression_info = CompressionInfo {
+            written_compressed: false,
+            compressed_size: None,
+        };
 
         let len = srcbuf.bytes_init();
 
@@ -328,7 +340,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                 encoder.write_all(&slice[..]).await.unwrap();
                 encoder.shutdown().await.unwrap();
                 let compressed = encoder.into_inner();
+                compression_info.compressed_size = Some(compressed.len());
                 if compressed.len() < len {
+                    compression_info.written_compressed = true;
                     let compressed_len = compressed.len();
                     compressed_buf = Some(compressed);
                     (BYTE_ZSTD, compressed_len, slice.into_inner())
@@ -359,7 +373,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
         } else {
             self.write_all(srcbuf, ctx).await
         };
-        (srcbuf, res.map(|_| offset))
+        (srcbuf, res.map(|_| (offset, compression_info)))
     }
 }
 
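// Illustrative sketch (assumed usage, not code from this commit): a caller of the new
// write_blob_maybe_compressed signature can inspect the returned CompressionInfo to see
// whether compression was attempted and whether the compressed form was kept. The
// function name and the use of Vec<u8> as the buffer type are assumptions here.
async fn write_and_report(
    writer: &mut BlobWriter<false>,
    buf: Vec<u8>,
    ctx: &RequestContext,
) -> Result<u64, std::io::Error> {
    let uncompressed_len = buf.len();
    let (_buf, res) = writer
        .write_blob_maybe_compressed(buf, ctx, ImageCompressionAlgorithm::Zstd { level: Some(1) })
        .await;
    let (offset, info) = res?;
    if info.written_compressed {
        // compressed_size is Some whenever the blob was large enough to be considered.
        println!(
            "stored compressed: {} -> {:?} bytes",
            uncompressed_len, info.compressed_size
        );
    }
    Ok(offset)
}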
@@ -416,12 +430,14 @@ pub(crate) mod tests {
         let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
         for blob in blobs.iter() {
             let (_, res) = if compression {
-                wtr.write_blob_maybe_compressed(
-                    blob.clone(),
-                    ctx,
-                    ImageCompressionAlgorithm::Zstd { level: Some(1) },
-                )
-                .await
+                let res = wtr
+                    .write_blob_maybe_compressed(
+                        blob.clone(),
+                        ctx,
+                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
+                    )
+                    .await;
+                (res.0, res.1.map(|(off, _)| off))
             } else {
                 wtr.write_blob(blob.clone(), ctx).await
             };
@@ -296,13 +296,19 @@ where
         let mut stack = Vec::new();
         stack.push((self.root_blk, None));
         let block_cursor = self.reader.block_cursor();
+        let mut node_buf = [0_u8; PAGE_SZ];
         while let Some((node_blknum, opt_iter)) = stack.pop() {
-            // Locate the node.
-            let node_buf = block_cursor
+            // Read the node, through the PS PageCache, into local variable `node_buf`.
+            // We could keep the page cache read guard alive, but, at the time of writing,
+            // we run quite small PS PageCaches => can't risk running out of
+            // PageCache space because this stream isn't consumed fast enough.
+            let page_read_guard = block_cursor
                 .read_blk(self.start_blk + node_blknum, ctx)
                 .await?;
-            let node = OnDiskNode::deparse(node_buf.as_ref())?;
+            node_buf.copy_from_slice(page_read_guard.as_ref());
+            drop(page_read_guard); // drop page cache read guard early
+
+            let node = OnDiskNode::deparse(&node_buf)?;
             let prefix_len = node.prefix_len as usize;
             let suffix_len = node.suffix_len as usize;
 
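// Illustrative sketch (not from this commit): the copy-then-drop pattern above in
// isolation — copy the cached page into a stack buffer and release the read guard before
// any long-lived use, so the PageCache slot is not pinned while the stream is suspended.
fn copy_page_out(guarded_page: &[u8]) -> [u8; PAGE_SZ] {
    let mut local = [0_u8; PAGE_SZ];
    local.copy_from_slice(guarded_page); // the guard can be dropped right after this copy
    local
}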
@@ -345,6 +351,7 @@ where
                 Either::Left(idx..node.num_children.into())
             };
 
+
             // idx points to the first match now. Keep going from there
             while let Some(idx) = iter.next() {
                 let key_off = idx * suffix_len;
@@ -51,7 +51,8 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use pageserver_api::keyspace::KeySpaceAccum;
+use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
+use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
 use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
@@ -61,7 +62,7 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;
 
-use super::storage_layer::PersistentLayerDesc;
+use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
 
 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -871,11 +872,183 @@ impl LayerMap {
         println!("End dump LayerMap");
         Ok(())
     }
 
+    /// `read_points` represent the tip of a timeline and any branch points, i.e. the places
+    /// where we expect to serve reads.
+    ///
+    /// This function is O(N) and should be called infrequently. The caller is responsible for
+    /// looking up and updating the Layer objects for these layer descriptors.
+    pub fn get_visibility(
+        &self,
+        mut read_points: Vec<Lsn>,
+    ) -> (
+        Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
+        KeySpace,
+    ) {
+        // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
+        // KeySpace is intended to be composed statically and iterated over.
+        struct KeyShadow {
+            // Map of range start to range end
+            inner: RangeSetBlaze<i128>,
+        }
+
+        impl KeyShadow {
+            fn new() -> Self {
+                Self {
+                    inner: Default::default(),
+                }
+            }
+
+            fn contains(&self, range: Range<Key>) -> bool {
+                let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
+                self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
+                    CheckSortedDisjoint::from([range_incl]),
+                ))
+            }
+
+            /// Add the input range to the keys covered by self.
+            ///
+            /// Return true if inserting this range covered some keys that were previously not covered
+            fn cover(&mut self, insert: Range<Key>) -> bool {
+                let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
+                self.inner.ranges_insert(range_incl)
+            }
+
+            fn reset(&mut self) {
+                self.inner = Default::default();
+            }
+
+            fn to_keyspace(&self) -> KeySpace {
+                let mut accum = KeySpaceAccum::new();
+                for range_incl in self.inner.ranges() {
+                    let range = Range {
+                        start: Key::from_i128(*range_incl.start()),
+                        end: Key::from_i128(range_incl.end() + 1),
+                    };
+                    accum.add_range(range)
+                }
+
+                accum.to_keyspace()
+            }
+        }
+
+        // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
+        // and a ReadPoint
+        read_points.sort_by_key(|rp| rp.0);
+        let mut shadow = KeyShadow::new();
+
+        // We will interleave all our read points and layers into a sorted collection
+        enum Item {
+            ReadPoint { lsn: Lsn },
+            Layer(Arc<PersistentLayerDesc>),
+        }
+
+        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
+        items.extend(self.iter_historic_layers().map(Item::Layer));
+        items.extend(
+            read_points
+                .into_iter()
+                .map(|rp| Item::ReadPoint { lsn: rp }),
+        );
+
+        // Ordering: we want to iterate like this:
+        // 1. Highest LSNs first
+        // 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
+        // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
+        items.sort_by_key(|item| {
+            std::cmp::Reverse(match item {
+                Item::Layer(layer) => {
+                    if layer.is_delta() {
+                        (Lsn(layer.get_lsn_range().end.0 - 1), 0)
+                    } else {
+                        (layer.image_layer_lsn(), 1)
+                    }
+                }
+                Item::ReadPoint { lsn } => (*lsn, 2),
+            })
+        });
+
+        let mut results = Vec::with_capacity(self.historic.len());
+
+        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
+
+        for item in items {
+            let (reached_lsn, is_readpoint) = match &item {
+                Item::ReadPoint { lsn } => (lsn, true),
+                Item::Layer(layer) => (&layer.lsn_range.start, false),
+            };
+            maybe_covered_deltas.retain(|d| {
+                if *reached_lsn >= d.lsn_range.start && is_readpoint {
+                    // We encountered a readpoint within the delta layer: it is visible
+
+                    results.push((d.clone(), LayerVisibilityHint::Visible));
+                    false
+                } else if *reached_lsn < d.lsn_range.start {
+                    // We passed the layer's range without encountering a read point: it is not visible
+                    results.push((d.clone(), LayerVisibilityHint::Covered));
+                    false
+                } else {
+                    // We're still in the delta layer: continue iterating
+                    true
+                }
+            });
+
+            match item {
+                Item::ReadPoint { lsn: _lsn } => {
+                    // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
+                    // to assume that the whole key range is visible at the branch point.
+                    shadow.reset();
+                }
+                Item::Layer(layer) => {
+                    let visibility = if layer.is_delta() {
+                        if shadow.contains(layer.get_key_range()) {
+                            // If a layer isn't visible based on current state, we must defer deciding whether
+                            // it is truly not visible until we have advanced past the delta's range: we might
+                            // encounter another branch point within this delta layer's LSN range.
+                            maybe_covered_deltas.push(layer);
+                            continue;
+                        } else {
+                            LayerVisibilityHint::Visible
+                        }
+                    } else {
+                        let modified = shadow.cover(layer.get_key_range());
+                        if modified {
+                            // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
+                            LayerVisibilityHint::Visible
+                        } else {
+                            // An image layer in a region that was already covered
+                            LayerVisibilityHint::Covered
+                        }
+                    };
+
+                    results.push((layer, visibility));
+                }
+            }
+        }
+
+        // Drain any remaining maybe_covered deltas
+        results.extend(
+            maybe_covered_deltas
+                .into_iter()
+                .map(|d| (d, LayerVisibilityHint::Covered)),
+        );
+
+        (results, shadow.to_keyspace())
+    }
 }
 
 #[cfg(test)]
 mod tests {
-    use pageserver_api::keyspace::KeySpace;
+    use crate::tenant::{storage_layer::LayerName, IndexPart};
+    use pageserver_api::{
+        key::DBDIR_KEY,
+        keyspace::{KeySpace, KeySpaceRandomAccum},
+    };
+    use std::{collections::HashMap, path::PathBuf};
+    use utils::{
+        id::{TenantId, TimelineId},
+        shard::TenantShardId,
+    };
+
     use super::*;
 
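// Illustrative sketch (assumed usage, not code from this commit): a caller could feed the
// branch tip plus any child branch points into get_visibility and then apply each hint to
// the corresponding Layer object; `apply_hint` stands in for that lookup-and-update step,
// which is not shown in this diff.
fn recompute_visibility(
    layer_map: &LayerMap,
    tip_lsn: Lsn,
    mut branch_points: Vec<Lsn>,
    apply_hint: impl Fn(&PersistentLayerDesc, LayerVisibilityHint),
) -> KeySpace {
    branch_points.push(tip_lsn);
    let (visibilities, shadow) = layer_map.get_visibility(branch_points);
    for (desc, visibility) in visibilities {
        apply_hint(desc.as_ref(), visibility);
    }
    // The returned shadow is the keyspace still readable below the lowest read point.
    shadow
}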
@@ -1002,4 +1175,299 @@ mod tests {
             }
         }
     }
+
+    #[test]
+    fn layer_visibility_basic() {
+        // A simple synthetic input, as a smoke test.
+        let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+        let timeline_id = TimelineId::generate();
+        let mut layer_map = LayerMap::default();
+        let mut updates = layer_map.batch_update();
+
+        const FAKE_LAYER_SIZE: u64 = 1024;
+
+        let inject_delta = |updates: &mut BatchedUpdates,
+                            key_start: i128,
+                            key_end: i128,
+                            lsn_start: u64,
+                            lsn_end: u64| {
+            let desc = PersistentLayerDesc::new_delta(
+                tenant_shard_id,
+                timeline_id,
+                Range {
+                    start: Key::from_i128(key_start),
+                    end: Key::from_i128(key_end),
+                },
+                Range {
+                    start: Lsn(lsn_start),
+                    end: Lsn(lsn_end),
+                },
+                1024,
+            );
+            updates.insert_historic(desc.clone());
+            desc
+        };
+
+        let inject_image =
+            |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
+                let desc = PersistentLayerDesc::new_img(
+                    tenant_shard_id,
+                    timeline_id,
+                    Range {
+                        start: Key::from_i128(key_start),
+                        end: Key::from_i128(key_end),
+                    },
+                    Lsn(lsn),
+                    FAKE_LAYER_SIZE,
+                );
+                updates.insert_historic(desc.clone());
+                desc
+            };
+
+        //
+        // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
+        // we expect to handle. You can follow these examples through in the same order as they would be processed
+        // by the function under test.
+        //
+
+        let mut read_points = vec![Lsn(1000)];
+
+        // A delta ahead of any image layer
+        let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
+
+        // An image layer is visible and covers some layers beneath itself
+        let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
+
+        // A delta layer covered by the image layer: should be covered
+        let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
+
+        // A delta layer partially covered by an image layer: should be visible
+        let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
+
+        // A delta layer not covered by an image layer: should be visible
+        let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
+
+        // An image layer covered by the image layer above: should be covered
+        let covered_image = inject_image(&mut updates, 10, 20, 89);
+
+        // An image layer partially covered by an image layer: should be visible
+        let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
+
+        // An image layer not covered by an image layer: should be visible
+        let not_covered_image = inject_image(&mut updates, 1, 4, 89);
+
+        // A read point: this will make subsequent layers below here visible, even if there are
+        // more recent layers covering them.
+        read_points.push(Lsn(80));
+
+        // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
+        let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
+
+        // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
+        // the read point should make it visible, even though its end LSN is covered
+        let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
+        let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
+        read_points.push(Lsn(65));
+        let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
+
+        let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
+
+        updates.flush();
+
+        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
+        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
+
+        assert_eq!(
+            layer_visibilities.get(&ahead_layer),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&visible_covering_img),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta),
+            Some(&LayerVisibilityHint::Covered)
+        );
+        assert_eq!(
+            layer_visibilities.get(&partially_covered_delta),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&not_covered_delta),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_image),
+            Some(&LayerVisibilityHint::Covered)
+        );
+        assert_eq!(
+            layer_visibilities.get(&partially_covered_image),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&not_covered_image),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta_below_read_point),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covering_img_between_read_points),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta_between_read_points),
+            Some(&LayerVisibilityHint::Covered)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta_intersects_read_point),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&visible_img_after_last_read_point),
+            Some(&LayerVisibilityHint::Visible)
+        );
+
+        // Shadow should include all the images below the last read point
+        let expected_shadow = KeySpace {
+            ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
+        };
+        assert_eq!(shadow, expected_shadow);
+    }
+
+    fn fixture_path(relative: &str) -> PathBuf {
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
+    }
+
+    #[test]
+    fn layer_visibility_realistic() {
+        // Load a large example layermap
+        let index_raw = std::fs::read_to_string(fixture_path(
+            "test_data/indices/mixed_workload/index_part.json",
+        ))
+        .unwrap();
+        let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
+
+        let tenant_id = TenantId::generate();
+        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+        let timeline_id = TimelineId::generate();
+
+        let mut layer_map = LayerMap::default();
+        let mut updates = layer_map.batch_update();
+        for (layer_name, layer_metadata) in index.layer_metadata {
+            let layer_desc = match layer_name {
+                LayerName::Image(layer_name) => PersistentLayerDesc {
+                    key_range: layer_name.key_range.clone(),
+                    lsn_range: layer_name.lsn_as_range(),
+                    tenant_shard_id,
+                    timeline_id,
+                    is_delta: false,
+                    file_size: layer_metadata.file_size,
+                },
+                LayerName::Delta(layer_name) => PersistentLayerDesc {
+                    key_range: layer_name.key_range,
+                    lsn_range: layer_name.lsn_range,
+                    tenant_shard_id,
+                    timeline_id,
+                    is_delta: true,
+                    file_size: layer_metadata.file_size,
+                },
+            };
+            updates.insert_historic(layer_desc);
+        }
+        updates.flush();
+
+        let read_points = vec![index.metadata.disk_consistent_lsn()];
+        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
+        for (layer_desc, visibility) in &layer_visibilities {
+            tracing::info!("{layer_desc:?}: {visibility:?}");
+            eprintln!("{layer_desc:?}: {visibility:?}");
+        }
+
+        // The shadow should be non-empty, since there were some image layers
+        assert!(!shadow.ranges.is_empty());
+
+        // At least some layers should be marked covered
+        assert!(layer_visibilities
+            .iter()
+            .any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
+
+        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
+
+        // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
+        for (layer_desc, visible) in &layer_visibilities {
+            let mut coverage = KeySpaceRandomAccum::new();
+            let mut covered_by = Vec::new();
+
+            for other_layer in layer_map.iter_historic_layers() {
+                if &other_layer == layer_desc {
+                    continue;
+                }
+                if !other_layer.is_delta()
+                    && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
+                    && other_layer.key_range.start <= layer_desc.key_range.end
+                    && layer_desc.key_range.start <= other_layer.key_range.end
+                {
+                    coverage.add_range(other_layer.get_key_range());
+                    covered_by.push((*other_layer).clone());
+                }
+            }
+            let coverage = coverage.to_keyspace();
+
+            let expect_visible = if coverage.ranges.len() == 1
+                && coverage.contains(&layer_desc.key_range.start)
+                && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
+            {
+                LayerVisibilityHint::Covered
+            } else {
+                LayerVisibilityHint::Visible
+            };
+
+            if expect_visible != *visible {
+                eprintln!(
+                    "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
+                    layer_desc.key_range.start,
+                    layer_desc.key_range.end,
+                    layer_desc.lsn_range.start,
+                    layer_desc.lsn_range.end,
+                    layer_desc.is_delta()
+                );
+                if expect_visible == LayerVisibilityHint::Covered {
+                    eprintln!("Covered by:");
+                    for other in covered_by {
+                        eprintln!(
+                            "  {}..{} @ {}",
+                            other.get_key_range().start,
+                            other.get_key_range().end,
+                            other.image_layer_lsn()
+                        );
+                    }
+                    if let Some(range) = coverage.ranges.first() {
+                        eprintln!(
+                            "Total coverage from contributing layers: {}..{}",
+                            range.start, range.end
+                        );
+                    } else {
+                        eprintln!(
+                            "Total coverage from contributing layers: {:?}",
+                            coverage.ranges
+                        );
+                    }
+                }
+            }
+            assert_eq!(expect_visible, *visible);
+        }
+
+        // Sanity: the layer that holds latest data for the DBDIR key should always be visible
+        // (just using this key as a key that will always exist for any layermap fixture)
+        let dbdir_layer = layer_map
+            .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
+            .unwrap();
+        assert!(matches!(
+            layer_visibilities.get(&dbdir_layer.layer).unwrap(),
+            LayerVisibilityHint::Visible
+        ));
+    }
 }
@@ -521,6 +521,10 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
 
         Ok(&self.historic_coverage)
     }
+
+    pub(crate) fn len(&self) -> usize {
+        self.layers.len()
+    }
 }
 
 #[test]
@@ -55,7 +55,7 @@ use utils::id::{TenantId, TimelineId};
 use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
 use super::timeline::detach_ancestor::PreparedTimelineDetach;
-use super::TenantSharedResources;
+use super::{GlobalShutDown, TenantSharedResources};
 
 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is elegible to service
@@ -116,8 +116,6 @@ pub(crate) enum ShardSelector {
     /// Only return the 0th shard, if it is present. If a non-0th shard is present,
     /// ignore it.
     Zero,
-    /// Pick the first shard we find for the TenantId
-    First,
     /// Pick the shard that holds this key
     Page(Key),
     /// The shard ID is known: pick the given shard
@@ -667,17 +665,20 @@ pub async fn init_tenant_mgr(
         let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
         let shard_identity = location_conf.shard;
         let slot = match location_conf.mode {
-            LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
-                conf,
-                tenant_shard_id,
-                &tenant_dir_path,
-                resources.clone(),
-                AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-                shard_identity,
-                Some(init_order.clone()),
-                SpawnMode::Lazy,
-                &ctx,
-            )),
+            LocationMode::Attached(attached_conf) => TenantSlot::Attached(
+                tenant_spawn(
+                    conf,
+                    tenant_shard_id,
+                    &tenant_dir_path,
+                    resources.clone(),
+                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                    shard_identity,
+                    Some(init_order.clone()),
+                    SpawnMode::Lazy,
+                    &ctx,
+                )
+                .expect("global shutdown during init_tenant_mgr cannot happen"),
+            ),
             LocationMode::Secondary(secondary_conf) => {
                 info!(
                     tenant_id = %tenant_shard_id.tenant_id,
@@ -725,7 +726,7 @@ fn tenant_spawn(
     init_order: Option<InitializationOrder>,
     mode: SpawnMode,
     ctx: &RequestContext,
-) -> Arc<Tenant> {
+) -> Result<Arc<Tenant>, GlobalShutDown> {
     // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
     // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
     // to avoid impacting prod runtime performance.
@@ -1192,7 +1193,10 @@ impl TenantManager {
                     None,
                     spawn_mode,
                     ctx,
-                );
+                )
+                .map_err(|_: GlobalShutDown| {
+                    UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
+                })?;
 
                 TenantSlot::Attached(tenant)
             }
@@ -1313,7 +1317,7 @@ impl TenantManager {
             None,
             SpawnMode::Eager,
             ctx,
-        );
+        )?;
 
         slot_guard.upsert(TenantSlot::Attached(tenant))?;
 
@@ -1384,34 +1388,32 @@ impl TenantManager {
         tenant_shard_id: TenantShardId,
     ) -> Result<(), DeleteTenantError> {
         let remote_path = remote_tenant_path(&tenant_shard_id);
-        let keys = match self
-            .resources
-            .remote_storage
-            .list(
-                Some(&remote_path),
-                remote_storage::ListingMode::NoDelimiter,
-                None,
-                &self.cancel,
-            )
-            .await
-        {
-            Ok(listing) => listing.keys,
-            Err(remote_storage::DownloadError::Cancelled) => {
-                return Err(DeleteTenantError::Cancelled)
-            }
-            Err(remote_storage::DownloadError::NotFound) => return Ok(()),
-            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
-        };
+        let mut keys_stream = self.resources.remote_storage.list_streaming(
+            Some(&remote_path),
+            remote_storage::ListingMode::NoDelimiter,
+            None,
+            &self.cancel,
+        );
+        while let Some(chunk) = keys_stream.next().await {
+            let keys = match chunk {
+                Ok(listing) => listing.keys,
+                Err(remote_storage::DownloadError::Cancelled) => {
+                    return Err(DeleteTenantError::Cancelled)
+                }
+                Err(remote_storage::DownloadError::NotFound) => return Ok(()),
+                Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
+            };
 
-        if keys.is_empty() {
-            tracing::info!("Remote storage already deleted");
-        } else {
-            tracing::info!("Deleting {} keys from remote storage", keys.len());
-            let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
-            self.resources
-                .remote_storage
-                .delete_objects(&keys, &self.cancel)
-                .await?;
+            if keys.is_empty() {
+                tracing::info!("Remote storage already deleted");
+            } else {
+                tracing::info!("Deleting {} keys from remote storage", keys.len());
+                let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
+                self.resources
+                    .remote_storage
+                    .delete_objects(&keys, &self.cancel)
+                    .await?;
+            }
         }
 
         Ok(())
@@ -2049,7 +2051,7 @@ impl TenantManager {
             None,
             SpawnMode::Eager,
             ctx,
-        );
+        )?;
 
         slot_guard.upsert(TenantSlot::Attached(tenant))?;
 
@@ -2090,7 +2092,6 @@ impl TenantManager {
         };
 
         match selector {
-            ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
             ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                 return ShardResolveResult::Found(tenant.clone())
             }
@@ -2172,6 +2173,9 @@ pub(crate) enum GetActiveTenantError {
     /// never happen.
     #[error("Tenant is broken: {0}")]
     Broken(String),
+
+    #[error("reconnect to switch tenant id")]
+    SwitchedTenant,
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -1378,6 +1378,18 @@ impl RemoteTimelineClient {
             .dirty
             .layer_metadata
             .drain()
+            .filter(|(_file_name, meta)| {
+                // Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from
+                // all shards anyway, we _could_ delete these, but
+                // - it creates a potential race if other shards are still
+                //   using the layers while this shard deletes them.
+                // - it means that if we rolled back the shard split, the ancestor shards would be in a state where
+                //   these timelines are present but corrupt (their index exists but some layers don't)
+                //
+                // These layers will eventually be cleaned up by the scrubber when it does physical GC.
+                meta.shard.shard_number == self.tenant_shard_id.shard_number
+                    && meta.shard.shard_count == self.tenant_shard_id.shard_count
+            })
             .map(|(file_name, meta)| {
                 remote_layer_path(
                     &self.tenant_shard_id.tenant_id,
@@ -8,6 +8,9 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;
 
+#[cfg(test)]
+pub mod split_writer;
+
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
 use crate::walrecord::NeonWalRecord;
@@ -451,20 +454,14 @@ pub enum ValueReconstructResult {
 /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
 /// of layers (for example when creating a branch that makes some previously covered layers visible). It should
 /// be used for cache management but not for correctness-critical checks.
-#[derive(Default, Debug, Clone, PartialEq, Eq)]
-pub(crate) enum LayerVisibilityHint {
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum LayerVisibilityHint {
     /// A Visible layer might be read while serving a read, because there is not an image layer between it
     /// and a readable LSN (the tip of the branch or a child's branch point)
     Visible,
     /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
     /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
-    #[allow(unused)]
     Covered,
-    /// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
-    /// in this state. Note that newly written layers may be called Visible immediately, this uninitialized
-    /// state is for when existing layers are constructed while loading a timeline.
-    #[default]
-    Uninitialized,
 }
 
 pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
@@ -626,23 +623,30 @@ impl LayerAccessStats {
         }
     }
 
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
-        let value = match visibility {
-            LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
-            LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
-        };
-
-        self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
-    }
-
-    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
-        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
-        match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
+    /// Helper for extracting the visibility hint from the literal value of our inner u64
+    fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
+        match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
             1 => LayerVisibilityHint::Visible,
             0 => LayerVisibilityHint::Covered,
             _ => unreachable!(),
         }
     }
 
+    /// Returns the old value which has been replaced
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
+        let value = match visibility {
+            LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
+            LayerVisibilityHint::Covered => 0x0,
+        };
+
+        let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
+        self.decode_visibility(old_bits)
+    }
+
+    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
+        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
+        self.decode_visibility(read)
+    }
 }
 
 /// Get a layer descriptor from a layer.
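// Illustrative sketch (an assumption, not code from this commit): how the single visibility
// bit behaves for a freshly zeroed LayerAccessStats, following decode_visibility above.
// Constructing the tuple struct directly is only for demonstration.
fn visibility_bit_example() {
    let stats = LayerAccessStats(std::sync::atomic::AtomicU64::new(0));
    // A zeroed word decodes as Covered (bit 0 at VISIBILITY_SHIFT).
    assert_eq!(stats.visibility(), LayerVisibilityHint::Covered);
    // set_visibility returns the hint that was previously stored.
    let previous = stats.set_visibility(LayerVisibilityHint::Visible);
    assert_eq!(previous, LayerVisibilityHint::Covered);
    assert_eq!(stats.visibility(), LayerVisibilityHint::Visible);
}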
@@ -467,7 +467,7 @@ impl DeltaLayerWriterInner {
             .write_blob_maybe_compressed(val, ctx, compression)
             .await;
         let off = match res {
-            Ok(off) => off,
+            Ok((off, _)) => off,
             Err(e) => return (val, Err(anyhow::anyhow!(e))),
         };
 
@@ -734,8 +734,22 @@ struct ImageLayerWriterInner {
     // Total uncompressed bytes passed into put_image
     uncompressed_bytes: u64,
 
+    // Like `uncompressed_bytes`,
+    // but only of images we might consider for compression
+    uncompressed_bytes_eligible: u64,
+
+    // Like `uncompressed_bytes`, but only of images
+    // where we have chosen their compressed form
+    uncompressed_bytes_chosen: u64,
+
+    // Number of keys in the layer.
+    num_keys: usize,
+
     blob_writer: BlobWriter<false>,
     tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
 
+    #[cfg_attr(not(feature = "testing"), allow(dead_code))]
+    last_written_key: Key,
 }
 
 impl ImageLayerWriterInner {
@@ -790,6 +804,10 @@ impl ImageLayerWriterInner {
|
|||||||
tree: tree_builder,
|
tree: tree_builder,
|
||||||
blob_writer,
|
blob_writer,
|
||||||
uncompressed_bytes: 0,
|
uncompressed_bytes: 0,
|
||||||
|
uncompressed_bytes_eligible: 0,
|
||||||
|
uncompressed_bytes_chosen: 0,
|
||||||
|
num_keys: 0,
|
||||||
|
last_written_key: Key::MIN,
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(writer)
|
Ok(writer)
|
||||||
@@ -808,18 +826,33 @@ impl ImageLayerWriterInner {
    ) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
        let compression = self.conf.image_compression;
-       self.uncompressed_bytes += img.len() as u64;
+       let uncompressed_len = img.len() as u64;
+       self.uncompressed_bytes += uncompressed_len;
+       self.num_keys += 1;
        let (_img, res) = self
            .blob_writer
            .write_blob_maybe_compressed(img, ctx, compression)
            .await;
        // TODO: re-use the buffer for `img` further upstack
-       let off = res?;
+       let (off, compression_info) = res?;
+       if compression_info.compressed_size.is_some() {
+           // The image has been considered for compression at least
+           self.uncompressed_bytes_eligible += uncompressed_len;
+       }
+       if compression_info.written_compressed {
+           // The image has been compressed
+           self.uncompressed_bytes_chosen += uncompressed_len;
+       }

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
        self.tree.append(&keybuf, off)?;

+       #[cfg(feature = "testing")]
+       {
+           self.last_written_key = key;
+       }

        Ok(())
    }

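Together, the three per-writer counters distinguish "could have been compressed" from "compression actually won", which is what the metrics emitted at finish() time report. A hedged sketch of how such counters could be turned into ratios; the struct and helper names below are illustrative, not part of the writer's API:

/// Summary of image-compression accounting, mirroring the counters kept by the writer.
struct CompressionAccounting {
    uncompressed_bytes: u64,          // all bytes passed to put_image
    uncompressed_bytes_eligible: u64, // bytes that were at least considered for compression
    uncompressed_bytes_chosen: u64,   // bytes for which the compressed form was kept
    compressed_size: u64,             // bytes actually written out
}

impl CompressionAccounting {
    /// Fraction of input that was eligible to try compressing.
    fn eligible_ratio(&self) -> f64 {
        ratio(self.uncompressed_bytes_eligible, self.uncompressed_bytes)
    }

    /// Fraction of input for which the compressed form actually won.
    fn chosen_ratio(&self) -> f64 {
        ratio(self.uncompressed_bytes_chosen, self.uncompressed_bytes)
    }

    /// Overall output/input ratio; below 1.0 means the layer shrank.
    fn output_ratio(&self) -> f64 {
        ratio(self.compressed_size, self.uncompressed_bytes)
    }
}

fn ratio(num: u64, den: u64) -> f64 {
    if den == 0 {
        0.0
    } else {
        num as f64 / den as f64
    }
}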
@@ -830,6 +863,7 @@ impl ImageLayerWriterInner {
        self,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
+       end_key: Option<Key>,
    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -837,6 +871,9 @@ impl ImageLayerWriterInner {
        // Calculate compression ratio
        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
+       crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
+           .inc_by(self.uncompressed_bytes_eligible);
+       crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
        crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);

        let mut file = self.blob_writer.into_inner();
@@ -877,11 +914,23 @@ impl ImageLayerWriterInner {
        let desc = PersistentLayerDesc::new_img(
            self.tenant_shard_id,
            self.timeline_id,
-           self.key_range.clone(),
+           if let Some(end_key) = end_key {
+               self.key_range.start..end_key
+           } else {
+               self.key_range.clone()
+           },
            self.lsn,
            metadata.len(),
        );

+       #[cfg(feature = "testing")]
+       if let Some(end_key) = end_key {
+           assert!(
+               self.last_written_key < end_key,
+               "written key violates end_key range"
+           );
+       }

        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -958,6 +1007,18 @@ impl ImageLayerWriter {
        self.inner.as_mut().unwrap().put_image(key, img, ctx).await
    }

+   #[cfg(test)]
+   /// Estimated size of the image layer.
+   pub(crate) fn estimated_size(&self) -> u64 {
+       let inner = self.inner.as_ref().unwrap();
+       inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
+   }
+
+   #[cfg(test)]
+   pub(crate) fn num_keys(&self) -> usize {
+       self.inner.as_ref().unwrap().num_keys
+   }

    ///
    /// Finish writing the image layer.
    ///
@@ -966,7 +1027,22 @@ impl ImageLayerWriter {
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<super::ResidentLayer> {
-       self.inner.take().unwrap().finish(timeline, ctx).await
+       self.inner.take().unwrap().finish(timeline, ctx, None).await
+   }
+
+   #[cfg(test)]
+   /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
+   pub(super) async fn finish_with_end_key(
+       mut self,
+       timeline: &Arc<Timeline>,
+       end_key: Key,
+       ctx: &RequestContext,
+   ) -> anyhow::Result<super::ResidentLayer> {
+       self.inner
+           .take()
+           .unwrap()
+           .finish(timeline, ctx, Some(end_key))
+           .await
    }
}

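finish() with end_key = None keeps the writer's original key range, while Some(end_key) truncates it to an exclusive upper bound; the testing-only assertion above checks that the last written key stays strictly below it. A tiny generic sketch of that convention, with illustrative names rather than the writer's real types:

use std::ops::Range;

/// end_key is exclusive: Some(end) truncates the range, None keeps it as-is.
fn effective_range<K: Ord>(full: Range<K>, end_key: Option<K>) -> Range<K> {
    match end_key {
        Some(end) => {
            debug_assert!(end <= full.end, "end_key must not extend the range");
            full.start..end
        }
        None => full,
    }
}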
@@ -24,7 +24,8 @@ use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self};
use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-   PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
+   LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
+   ValuesReconstructState,
};

use utils::generation::Generation;
@@ -246,7 +247,7 @@ impl Layer {
            &timeline.generation,
        );

-       let layer = LayerInner::new(
+       LayerInner::new(
            conf,
            timeline,
            local_path,
@@ -254,14 +255,7 @@ impl Layer {
            Some(inner),
            timeline.generation,
            timeline.get_shard_index(),
-       );
+       )
-
-       // Newly created layers are marked visible by default: the usual case is that they were created to be read.
-       layer
-           .access_stats
-           .set_visibility(super::LayerVisibilityHint::Visible);
-
-       layer
        }));

        let downloaded = resident.expect("just initialized");
@@ -493,6 +487,32 @@ impl Layer {
            }
        }
    }

+   pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
+       let old_visibility = self.access_stats().set_visibility(visibility.clone());
+       use LayerVisibilityHint::*;
+       match (old_visibility, visibility) {
+           (Visible, Covered) => {
+               // Subtract this layer's contribution to the visible size metric
+               if let Some(tl) = self.0.timeline.upgrade() {
+                   tl.metrics
+                       .visible_physical_size_gauge
+                       .sub(self.0.desc.file_size)
+               }
+           }
+           (Covered, Visible) => {
+               // Add this layer's contribution to the visible size metric
+               if let Some(tl) = self.0.timeline.upgrade() {
+                   tl.metrics
+                       .visible_physical_size_gauge
+                       .add(self.0.desc.file_size)
+               }
+           }
+           (Covered, Covered) | (Visible, Visible) => {
+               // no change
+           }
+       }
+   }
}

/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
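Because the stats-level set_visibility returns the hint it replaced, the gauge above only moves on genuine Visible/Covered transitions and repeated calls with the same hint are no-ops. The same idea in isolation, with a plain integer standing in for the metrics gauge (names here are illustrative):

/// Only genuine transitions move the gauge; repeated calls with the same state are no-ops.
fn apply_visibility_transition(gauge: &mut i64, was_visible: bool, now_visible: bool, file_size: u64) {
    match (was_visible, now_visible) {
        (true, false) => *gauge -= file_size as i64, // became covered
        (false, true) => *gauge += file_size as i64, // became visible
        _ => {}                                      // no transition, no change
    }
}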
@@ -693,6 +713,13 @@ impl Drop for LayerInner {
                timeline.metrics.layer_count_image.dec();
                timeline.metrics.layer_size_image.sub(self.desc.file_size);
            }

+           if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
+               timeline
+                   .metrics
+                   .visible_physical_size_gauge
+                   .sub(self.desc.file_size);
+           }
        }

        if !*self.wanted_deleted.get_mut() {
@@ -801,6 +828,12 @@ impl LayerInner {
            timeline.metrics.layer_size_image.add(desc.file_size);
        }

+       // New layers are visible by default. This metric is later updated on drop or in set_visibility
+       timeline
+           .metrics
+           .visible_physical_size_gauge
+           .add(desc.file_size);
+
        LayerInner {
            conf,
            debug_str: {
@@ -41,6 +41,20 @@ pub struct PersistentLayerKey {
    pub is_delta: bool,
}

+impl std::fmt::Display for PersistentLayerKey {
+   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+       write!(
+           f,
+           "{}..{} {}..{} is_delta={}",
+           self.key_range.start,
+           self.key_range.end,
+           self.lsn_range.start,
+           self.lsn_range.end,
+           self.is_delta
+       )
+   }
+}
+
impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
pageserver/src/tenant/storage_layer/split_writer.rs (new file, 244 lines)
@@ -0,0 +1,244 @@
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use bytes::Bytes;
|
||||||
|
use pageserver_api::key::{Key, KEY_SIZE};
|
||||||
|
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
|
||||||
|
|
||||||
|
use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
|
||||||
|
|
||||||
|
use super::{ImageLayerWriter, ResidentLayer};
|
||||||
|
|
||||||
|
/// An image writer that takes images and produces multiple image layers. The interface does not
|
||||||
|
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
|
||||||
|
/// to be cleaned up)
|
||||||
|
#[must_use]
|
||||||
|
pub struct SplitImageLayerWriter {
|
||||||
|
inner: ImageLayerWriter,
|
||||||
|
target_layer_size: u64,
|
||||||
|
generated_layers: Vec<ResidentLayer>,
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
lsn: Lsn,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SplitImageLayerWriter {
|
||||||
|
pub async fn new(
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
start_key: Key,
|
||||||
|
lsn: Lsn,
|
||||||
|
target_layer_size: u64,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
) -> anyhow::Result<Self> {
|
||||||
|
Ok(Self {
|
||||||
|
target_layer_size,
|
||||||
|
inner: ImageLayerWriter::new(
|
||||||
|
conf,
|
||||||
|
timeline_id,
|
||||||
|
tenant_shard_id,
|
||||||
|
&(start_key..Key::MAX),
|
||||||
|
lsn,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?,
|
||||||
|
generated_layers: Vec::new(),
|
||||||
|
conf,
|
||||||
|
timeline_id,
|
||||||
|
tenant_shard_id,
|
||||||
|
lsn,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn put_image(
|
||||||
|
&mut self,
|
||||||
|
key: Key,
|
||||||
|
img: Bytes,
|
||||||
|
tline: &Arc<Timeline>,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
// The current estimation is an upper bound of the space that the key/image could take
|
||||||
|
// because we did not consider compression in this estimation. The resulting image layer
|
||||||
|
// could be smaller than the target size.
|
||||||
|
let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
|
||||||
|
if self.inner.num_keys() >= 1
|
||||||
|
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||||
|
{
|
||||||
|
let next_image_writer = ImageLayerWriter::new(
|
||||||
|
self.conf,
|
||||||
|
self.timeline_id,
|
||||||
|
self.tenant_shard_id,
|
||||||
|
&(key..Key::MAX),
|
||||||
|
self.lsn,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
|
||||||
|
self.generated_layers.push(
|
||||||
|
prev_image_writer
|
||||||
|
.finish_with_end_key(tline, key, ctx)
|
||||||
|
.await?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
self.inner.put_image(key, img, ctx).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn finish(
|
||||||
|
self,
|
||||||
|
tline: &Arc<Timeline>,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
end_key: Key,
|
||||||
|
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||||
|
let Self {
|
||||||
|
mut generated_layers,
|
||||||
|
inner,
|
||||||
|
..
|
||||||
|
} = self;
|
||||||
|
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
|
||||||
|
Ok(generated_layers)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use crate::{
|
||||||
|
tenant::{
|
||||||
|
harness::{TenantHarness, TIMELINE_ID},
|
||||||
|
storage_layer::AsLayerDesc,
|
||||||
|
},
|
||||||
|
DEFAULT_PG_VERSION,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
fn get_key(id: u32) -> Key {
|
||||||
|
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||||
|
key.field6 = id;
|
||||||
|
key
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_img(id: u32) -> Bytes {
|
||||||
|
format!("{id:064}").into()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_large_img() -> Bytes {
|
||||||
|
vec![0; 8192].into()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn write_one_image() {
|
||||||
|
let harness = TenantHarness::create("split_writer_write_one_image")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
|
||||||
|
let tline = tenant
|
||||||
|
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut writer = SplitImageLayerWriter::new(
|
||||||
|
tenant.conf,
|
||||||
|
tline.timeline_id,
|
||||||
|
tenant.tenant_shard_id,
|
||||||
|
get_key(0),
|
||||||
|
Lsn(0x18),
|
||||||
|
4 * 1024 * 1024,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
writer
|
||||||
|
.put_image(get_key(0), get_img(0), &tline, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
|
||||||
|
assert_eq!(layers.len(), 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn write_split() {
|
||||||
|
let harness = TenantHarness::create("split_writer_write_split")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
|
||||||
|
let tline = tenant
|
||||||
|
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut writer = SplitImageLayerWriter::new(
|
||||||
|
tenant.conf,
|
||||||
|
tline.timeline_id,
|
||||||
|
tenant.tenant_shard_id,
|
||||||
|
get_key(0),
|
||||||
|
Lsn(0x18),
|
||||||
|
4 * 1024 * 1024,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
const N: usize = 2000;
|
||||||
|
for i in 0..N {
|
||||||
|
let i = i as u32;
|
||||||
|
writer
|
||||||
|
.put_image(get_key(i), get_large_img(), &tline, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
let layers = writer
|
||||||
|
.finish(&tline, &ctx, get_key(N as u32))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(layers.len(), N / 512 + 1);
|
||||||
|
for idx in 0..layers.len() {
|
||||||
|
assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||||
|
assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||||
|
if idx > 0 {
|
||||||
|
assert_eq!(
|
||||||
|
layers[idx - 1].layer_desc().key_range.end,
|
||||||
|
layers[idx].layer_desc().key_range.start
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn write_large_img() {
|
||||||
|
let harness = TenantHarness::create("split_writer_write_large_img")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
|
||||||
|
let tline = tenant
|
||||||
|
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut writer = SplitImageLayerWriter::new(
|
||||||
|
tenant.conf,
|
||||||
|
tline.timeline_id,
|
||||||
|
tenant.tenant_shard_id,
|
||||||
|
get_key(0),
|
||||||
|
Lsn(0x18),
|
||||||
|
4 * 1024,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
writer
|
||||||
|
.put_image(get_key(0), get_img(0), &tline, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
writer
|
||||||
|
.put_image(get_key(1), get_large_img(), &tline, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
|
||||||
|
assert_eq!(layers.len(), 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
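The split writer's put_image rolls over to a fresh ImageLayerWriter before an image would push the current layer past the target size; the estimate is an upper bound since compression is ignored, so produced layers may end up smaller than the target. The roll-over condition in isolation, as a hedged sketch with generic parameter names:

/// Returns true when the pending image should start a new layer instead of being
/// appended to the current one. Mirrors the check in put_image above: never split
/// before at least one key has been written, and split once the upper-bound size
/// estimate would reach the target.
fn should_split(
    num_keys_in_current: usize,
    current_size_estimate: u64,
    key_size: u64,
    img_len: u64,
    target_layer_size: u64,
) -> bool {
    let addition_size_estimation = key_size + img_len;
    num_keys_in_current >= 1
        && current_size_estimate + addition_size_estimation >= target_layer_size
}

A caller would evaluate this with the writer's estimated_size() and num_keys() before each put, finishing the current layer with the incoming key as its exclusive end key whenever it returns true.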
@@ -3,6 +3,7 @@ pub(crate) mod compaction;
|
|||||||
pub mod delete;
|
pub mod delete;
|
||||||
pub(crate) mod detach_ancestor;
|
pub(crate) mod detach_ancestor;
|
||||||
mod eviction_task;
|
mod eviction_task;
|
||||||
|
pub(crate) mod handle;
|
||||||
mod init;
|
mod init;
|
||||||
pub mod layer_manager;
|
pub mod layer_manager;
|
||||||
pub(crate) mod logical_size;
|
pub(crate) mod logical_size;
|
||||||
@@ -17,6 +18,7 @@ use camino::Utf8Path;
|
|||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use enumset::EnumSet;
|
use enumset::EnumSet;
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
|
use handle::ShardTimelineId;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
key::{
|
key::{
|
||||||
@@ -58,7 +60,7 @@ use std::{
|
|||||||
sync::atomic::AtomicU64,
|
sync::atomic::AtomicU64,
|
||||||
};
|
};
|
||||||
use std::{
|
use std::{
|
||||||
cmp::{max, min, Ordering},
|
cmp::{max, min},
|
||||||
ops::ControlFlow,
|
ops::ControlFlow,
|
||||||
};
|
};
|
||||||
use std::{
|
use std::{
|
||||||
@@ -74,6 +76,7 @@ use crate::{
|
|||||||
metadata::TimelineMetadata,
|
metadata::TimelineMetadata,
|
||||||
storage_layer::PersistentLayerDesc,
|
storage_layer::PersistentLayerDesc,
|
||||||
},
|
},
|
||||||
|
walredo,
|
||||||
};
|
};
|
||||||
use crate::{
|
use crate::{
|
||||||
context::{DownloadBehavior, RequestContext},
|
context::{DownloadBehavior, RequestContext},
|
||||||
@@ -140,7 +143,10 @@ use self::walreceiver::{WalReceiver, WalReceiverConf};
|
|||||||
use super::{config::TenantConf, upload_queue::NotInitialized};
|
use super::{config::TenantConf, upload_queue::NotInitialized};
|
||||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||||
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
|
use super::{
|
||||||
|
remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
|
||||||
|
storage_layer::ReadableLayer,
|
||||||
|
};
|
||||||
use super::{
|
use super::{
|
||||||
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
|
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
|
||||||
GcError,
|
GcError,
|
||||||
@@ -177,25 +183,6 @@ impl std::fmt::Display for ImageLayerCreationMode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
||||||
pub(crate) struct Hole {
|
|
||||||
key_range: Range<Key>,
|
|
||||||
coverage_size: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Ord for Hole {
|
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
|
||||||
other.coverage_size.cmp(&self.coverage_size) // inverse order
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PartialOrd for Hole {
|
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
|
||||||
Some(self.cmp(other))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||||
/// Can be removed after all refactors are done.
|
/// Can be removed after all refactors are done.
|
||||||
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
|
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
|
||||||
@@ -443,6 +430,8 @@ pub struct Timeline {
|
|||||||
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
|
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
|
||||||
|
|
||||||
pub(crate) l0_flush_global_state: L0FlushGlobalState,
|
pub(crate) l0_flush_global_state: L0FlushGlobalState,
|
||||||
|
|
||||||
|
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct WalReceiverInfo {
|
pub struct WalReceiverInfo {
|
||||||
@@ -548,7 +537,6 @@ impl GetVectoredError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct MissingKeyError {
|
pub struct MissingKeyError {
|
||||||
key: Key,
|
key: Key,
|
||||||
shard: ShardNumber,
|
shard: ShardNumber,
|
||||||
@@ -559,6 +547,12 @@ pub struct MissingKeyError {
|
|||||||
backtrace: Option<std::backtrace::Backtrace>,
|
backtrace: Option<std::backtrace::Backtrace>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for MissingKeyError {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "{}", self)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for MissingKeyError {
|
impl std::fmt::Display for MissingKeyError {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
write!(
|
write!(
|
||||||
@@ -1010,7 +1004,10 @@ impl Timeline {
|
|||||||
.for_get_kind(GetKind::Singular)
|
.for_get_kind(GetKind::Singular)
|
||||||
.observe(elapsed.as_secs_f64());
|
.observe(elapsed.as_secs_f64());
|
||||||
|
|
||||||
if cfg!(feature = "testing") && res.is_err() {
|
if cfg!(feature = "testing")
|
||||||
|
&& res.is_err()
|
||||||
|
&& !matches!(res, Err(PageReconstructError::Cancelled))
|
||||||
|
{
|
||||||
// it can only be walredo issue
|
// it can only be walredo issue
|
||||||
use std::fmt::Write;
|
use std::fmt::Write;
|
||||||
|
|
||||||
@@ -1929,6 +1926,9 @@ impl Timeline {
|
|||||||
tracing::debug!("Cancelling CancellationToken");
|
tracing::debug!("Cancelling CancellationToken");
|
||||||
self.cancel.cancel();
|
self.cancel.cancel();
|
||||||
|
|
||||||
|
// Prevent new page service requests from starting.
|
||||||
|
self.handles.shutdown();
|
||||||
|
|
||||||
// Transition the remote_client into a state where it's only useful for timeline deletion.
|
// Transition the remote_client into a state where it's only useful for timeline deletion.
|
||||||
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
|
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
|
||||||
self.remote_client.stop();
|
self.remote_client.stop();
|
||||||
@@ -2454,6 +2454,8 @@ impl Timeline {
|
|||||||
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
|
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
|
||||||
|
|
||||||
l0_flush_global_state: resources.l0_flush_global_state,
|
l0_flush_global_state: resources.l0_flush_global_state,
|
||||||
|
|
||||||
|
handles: Default::default(),
|
||||||
};
|
};
|
||||||
result.repartition_threshold =
|
result.repartition_threshold =
|
||||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||||
@@ -2737,6 +2739,10 @@ impl Timeline {
|
|||||||
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
||||||
// on retry.
|
// on retry.
|
||||||
|
|
||||||
|
// Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
|
||||||
|
drop(guard); // drop write lock, update_layer_visibility will take a read lock.
|
||||||
|
self.update_layer_visibility().await;
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"loaded layer map with {} layers at {}, total physical size: {}",
|
"loaded layer map with {} layers at {}, total physical size: {}",
|
||||||
num_layers, disk_consistent_lsn, total_physical_size
|
num_layers, disk_consistent_lsn, total_physical_size
|
||||||
@@ -3723,6 +3729,17 @@ impl Timeline {
|
|||||||
&self.shard_identity
|
&self.shard_identity
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
|
||||||
|
ShardTimelineId {
|
||||||
|
shard_index: ShardIndex {
|
||||||
|
shard_number: self.shard_identity.number,
|
||||||
|
shard_count: self.shard_identity.count,
|
||||||
|
},
|
||||||
|
timeline_id: self.timeline_id,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Get a handle to the latest layer for appending.
|
/// Get a handle to the latest layer for appending.
|
||||||
///
|
///
|
||||||
@@ -4075,6 +4092,21 @@ impl Timeline {
|
|||||||
// release lock on 'layers'
|
// release lock on 'layers'
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
|
||||||
|
// This makes us refuse ingest until the new layers have been persisted to the remote.
|
||||||
|
self.remote_client
|
||||||
|
.wait_completion()
|
||||||
|
.await
|
||||||
|
.map_err(|e| match e {
|
||||||
|
WaitCompletionError::UploadQueueShutDownOrStopped
|
||||||
|
| WaitCompletionError::NotInitialized(
|
||||||
|
NotInitialized::ShuttingDown | NotInitialized::Stopped,
|
||||||
|
) => FlushLayerError::Cancelled,
|
||||||
|
WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => {
|
||||||
|
FlushLayerError::Other(anyhow!(e).into())
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
|
||||||
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
||||||
// a compaction can delete the file and then it won't be available for uploads any more.
|
// a compaction can delete the file and then it won't be available for uploads any more.
|
||||||
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
|
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
|
||||||
@@ -4667,27 +4699,6 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// The writer.finish() above already did the fsync of the inodes.
|
|
||||||
// We just need to fsync the directory in which these inodes are linked,
|
|
||||||
// which we know to be the timeline directory.
|
|
||||||
if !image_layers.is_empty() {
|
|
||||||
// We use fatal_err() below because the after writer.finish() returns with success,
|
|
||||||
// the in-memory state of the filesystem already has the layer file in its final place,
|
|
||||||
// and subsequent pageserver code could think it's durable while it really isn't.
|
|
||||||
let timeline_dir = VirtualFile::open(
|
|
||||||
&self
|
|
||||||
.conf
|
|
||||||
.timeline_path(&self.tenant_shard_id, &self.timeline_id),
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.fatal_err("VirtualFile::open for timeline dir fsync");
|
|
||||||
timeline_dir
|
|
||||||
.sync_all()
|
|
||||||
.await
|
|
||||||
.fatal_err("VirtualFile::sync_all timeline dir");
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
|
|
||||||
// FIXME: we could add the images to be uploaded *before* returning from here, but right
|
// FIXME: we could add the images to be uploaded *before* returning from here, but right
|
||||||
@@ -4696,6 +4707,9 @@ impl Timeline {
|
|||||||
drop_wlock(guard);
|
drop_wlock(guard);
|
||||||
timer.stop_and_record();
|
timer.stop_and_record();
|
||||||
|
|
||||||
|
// Creating image layers may have caused some previously visible layers to be covered
|
||||||
|
self.update_layer_visibility().await;
|
||||||
|
|
||||||
Ok(image_layers)
|
Ok(image_layers)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -5460,20 +5474,22 @@ impl Timeline {
|
|||||||
} else {
|
} else {
|
||||||
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
|
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
|
||||||
};
|
};
|
||||||
|
let res = self
|
||||||
let img = match self
|
|
||||||
.walredo_mgr
|
.walredo_mgr
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.context("timeline has no walredo manager")
|
.context("timeline has no walredo manager")
|
||||||
.map_err(PageReconstructError::WalRedo)?
|
.map_err(PageReconstructError::WalRedo)?
|
||||||
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
||||||
.await
|
.await;
|
||||||
.context("reconstruct a page image")
|
let img = match res {
|
||||||
{
|
|
||||||
Ok(img) => img,
|
Ok(img) => img,
|
||||||
Err(e) => return Err(PageReconstructError::WalRedo(e)),
|
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
|
||||||
|
Err(walredo::Error::Other(e)) => {
|
||||||
|
return Err(PageReconstructError::WalRedo(
|
||||||
|
e.context("reconstruct a page image"),
|
||||||
|
))
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(img)
|
Ok(img)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
@@ -63,10 +63,19 @@ pub(super) async fn delete_local_timeline_directory(
|
|||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline: &Timeline,
|
timeline: &Timeline,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
|
// Always ensure the lock order is compaction -> gc.
|
||||||
let guards = crate::timed(
|
let compaction_lock = timeline.compaction_lock.lock();
|
||||||
guards,
|
let compaction_lock = crate::timed(
|
||||||
"acquire gc and compaction locks",
|
compaction_lock,
|
||||||
|
"acquires compaction lock",
|
||||||
|
std::time::Duration::from_secs(5),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let gc_lock = timeline.gc_lock.lock();
|
||||||
|
let gc_lock = crate::timed(
|
||||||
|
gc_lock,
|
||||||
|
"acquires gc lock",
|
||||||
std::time::Duration::from_secs(5),
|
std::time::Duration::from_secs(5),
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
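The rewritten deletion path always takes the compaction lock before the gc lock and later drops them in the reverse order; as long as every caller uses the same order, the two locks cannot deadlock against each other. A generic tokio sketch of that discipline (only the two lock names are taken from the code above):

use tokio::sync::{Mutex, MutexGuard};

struct TimelineLocks {
    compaction_lock: Mutex<()>,
    gc_lock: Mutex<()>,
}

impl TimelineLocks {
    /// Acquire both locks in the fixed order compaction -> gc. Callers that all
    /// follow this order cannot deadlock with each other; dropping the returned
    /// guards in reverse (gc first) mirrors the deletion path above.
    async fn lock_for_delete(&self) -> (MutexGuard<'_, ()>, MutexGuard<'_, ()>) {
        let compaction = self.compaction_lock.lock().await;
        let gc = self.gc_lock.lock().await;
        (compaction, gc)
    }
}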
@@ -107,7 +116,8 @@ pub(super) async fn delete_local_timeline_directory(
|
|||||||
.context("fsync_pre_mark_remove")?;
|
.context("fsync_pre_mark_remove")?;
|
||||||
|
|
||||||
info!("finished deleting layer files, releasing locks");
|
info!("finished deleting layer files, releasing locks");
|
||||||
drop(guards);
|
drop(gc_lock);
|
||||||
|
drop(compaction_lock);
|
||||||
|
|
||||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||||
@@ -206,11 +216,10 @@ impl DeleteTimelineFlow {
|
|||||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||||
// error out if some of the shutdown tasks have already been completed!
|
// error out if some of the shutdown tasks have already been completed!
|
||||||
#[instrument(skip_all, fields(%inplace))]
|
#[instrument(skip_all)]
|
||||||
pub async fn run(
|
pub async fn run(
|
||||||
tenant: &Arc<Tenant>,
|
tenant: &Arc<Tenant>,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
inplace: bool,
|
|
||||||
) -> Result<(), DeleteTimelineError> {
|
) -> Result<(), DeleteTimelineError> {
|
||||||
super::debug_assert_current_span_has_tenant_and_timeline_id();
|
super::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
@@ -235,11 +244,7 @@ impl DeleteTimelineFlow {
|
|||||||
))?
|
))?
|
||||||
});
|
});
|
||||||
|
|
||||||
if inplace {
|
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||||
Self::background(guard, tenant.conf, tenant, &timeline).await?
|
|
||||||
} else {
|
|
||||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
pageserver/src/tenant/timeline/handle.rs (new file, 967 lines)
@@ -0,0 +1,967 @@
|
//! An efficient way to keep the timeline gate open without preventing
|
||||||
|
//! timeline shutdown for longer than a single call to a timeline method.
|
||||||
|
//!
|
||||||
|
//! # Motivation
|
||||||
|
//!
|
||||||
|
//! On a single page service connection, we're typically serving a single TenantTimelineId.
|
||||||
|
//!
|
||||||
|
//! Without sharding, there is a single Timeline object to which we dispatch
|
||||||
|
//! all requests. For example, a getpage request gets dispatched to the
|
||||||
|
//! Timeline::get method of the Timeline object that represents the
|
||||||
|
//! (tenant,timeline) of that connection.
|
||||||
|
//!
|
||||||
|
//! With sharding, for each request that comes in on the connection,
|
||||||
|
//! we first have to perform shard routing based on the requested key (=~ page number).
|
||||||
|
//! The result of shard routing is a Timeline object.
|
||||||
|
//! We then dispatch the request to that Timeline object.
|
||||||
|
//!
|
||||||
|
//! Regardless of whether the tenant is sharded or not, we want to ensure that
|
||||||
|
//! we hold the Timeline gate open while we're invoking the method on the
|
||||||
|
//! Timeline object.
|
||||||
|
//!
|
||||||
|
//! However, we want to avoid the overhead of entering the gate for every
|
||||||
|
//! method invocation.
|
||||||
|
//!
|
||||||
|
//! Further, for shard routing, we want to avoid calling the tenant manager to
|
||||||
|
//! resolve the shard for every request. Instead, we want to cache the
|
||||||
|
//! routing result so we can bypass the tenant manager for all subsequent requests
|
||||||
|
//! that get routed to that shard.
|
||||||
|
//!
|
||||||
|
//! Regardless of how we accomplish the above, it should not
|
||||||
|
//! prevent the Timeline from shutting down promptly.
|
||||||
|
//!
|
||||||
|
//! # Design
|
||||||
|
//!
|
||||||
|
//! There are three user-facing data structures:
|
||||||
|
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
|
||||||
|
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
|
||||||
|
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
|
||||||
|
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
|
||||||
|
//!
|
||||||
|
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
|
||||||
|
//!
|
||||||
|
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
|
||||||
|
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
|
||||||
|
//!
|
||||||
|
//! To dispatch a request, the page service connection calls `Cache::get`.
|
||||||
|
//!
|
||||||
|
//! A cache miss means we consult the tenant manager for shard routing,
|
||||||
|
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
|
||||||
|
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
|
||||||
|
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
|
||||||
|
//!
|
||||||
|
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
|
||||||
|
//! and find the `Weak<HandleInner>` in the cache.
|
||||||
|
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
|
||||||
|
//!
|
||||||
|
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||||
|
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
|
||||||
|
//!
|
||||||
|
//! # Memory Management / How The Reference Cycle Is Broken
|
||||||
|
//!
|
||||||
|
//! The attentive reader may have noticed the strong reference cycle
|
||||||
|
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
|
||||||
|
//!
|
||||||
|
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
|
||||||
|
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
|
||||||
|
//!
|
||||||
|
//! The cycle is broken by either
|
||||||
|
//! - `PerTimelineState::shutdown` or
|
||||||
|
//! - dropping the `Cache`.
|
||||||
|
//!
|
||||||
|
//! Concurrently existing `Handle`s will extend the existence of the cycle.
|
||||||
|
//! However, since `Handle`s are short-lived and new `Handle`s are not
|
||||||
|
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
|
||||||
|
//! that extension of the cycle is bounded.
|
||||||
|
//!
|
||||||
|
//! # Fast Path for Shard Routing
|
||||||
|
//!
|
||||||
|
//! The `Cache` has a fast path for shard routing to avoid calling into
|
||||||
|
//! the tenant manager for every request.
|
||||||
|
//!
|
||||||
|
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
|
||||||
|
//!
|
||||||
|
//! The current implementation uses the first entry in the hash map
|
||||||
|
//! to determine the `ShardParameters` and derive the correct
|
||||||
|
//! `ShardIndex` for the requested key.
|
||||||
|
//!
|
||||||
|
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
|
||||||
|
//!
|
||||||
|
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
|
||||||
|
//! it's a hit.
|
||||||
|
//!
|
||||||
|
//! ## Cache invalidation
|
||||||
|
//!
|
||||||
|
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
|
||||||
|
//! The only reasons why an entry in the cache can become stale are:
|
||||||
|
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
|
||||||
|
//! being detached, timeline or shard deleted, or pageserver is shutting down.
|
||||||
|
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
|
||||||
|
//!
|
||||||
|
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
|
||||||
|
//! timeline has shut down, and when that happens, we remove the entry from the cache.
|
||||||
|
//!
|
||||||
|
//! Regarding (2), the insight is that it is totally fine to keep dispatching requests
|
||||||
|
//! to the parent shard during a shard split. Eventually, the shard split task will
|
||||||
|
//! shut down the parent => case (1).
|
||||||
|
|
||||||
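The cache described above holds only a Weak reference per shard/timeline, upgrades it to a short-lived strong reference for each request, and lazily drops entries whose upgrade fails. A self-contained sketch of that Weak-upgrade-per-request pattern; the Inner and ConnectionCache types are stand-ins, not this module's real HandleInner/Cache:

use std::sync::{Arc, Weak};

// Stand-in for the long-lived per-timeline object; only the Arc/Weak wiring is the point.
struct Inner;

struct ConnectionCache {
    cached: Option<Weak<Inner>>,
}

impl ConnectionCache {
    /// Per-request lookup: upgrade the cached Weak into a short-lived strong ref.
    /// A failed upgrade means the owning side dropped its Arc (shutdown), so the
    /// stale entry is discarded and the caller would fall back to the slow path.
    fn get(&mut self) -> Option<Arc<Inner>> {
        match self.cached.as_ref().and_then(|w| w.upgrade()) {
            Some(strong) => Some(strong),
            None => {
                self.cached = None; // lazy invalidation, as described above
                None
            }
        }
    }
}

fn main() {
    // The long-lived strong ref lives on the "timeline" side...
    let per_timeline_state: Arc<Inner> = Arc::new(Inner);
    // ...the connection cache only holds a Weak to it.
    let mut cache = ConnectionCache {
        cached: Some(Arc::downgrade(&per_timeline_state)),
    };

    assert!(cache.get().is_some()); // hit while the strong ref is alive
    drop(per_timeline_state); // shutdown drops the long-lived Arc
    assert!(cache.get().is_none()); // later lookups miss and self-invalidate
}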
|
use std::collections::hash_map;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::atomic::AtomicBool;
|
||||||
|
use std::sync::atomic::Ordering;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::Mutex;
|
||||||
|
use std::sync::Weak;
|
||||||
|
|
||||||
|
use pageserver_api::shard::ShardIdentity;
|
||||||
|
use tracing::instrument;
|
||||||
|
use tracing::trace;
|
||||||
|
use utils::id::TimelineId;
|
||||||
|
use utils::shard::ShardIndex;
|
||||||
|
use utils::shard::ShardNumber;
|
||||||
|
|
||||||
|
use crate::tenant::mgr::ShardSelector;
|
||||||
|
|
||||||
|
/// The requirement for Debug is so that #[derive(Debug)] works in some places.
|
||||||
|
pub(crate) trait Types: Sized + std::fmt::Debug {
|
||||||
|
type TenantManagerError: Sized + std::fmt::Debug;
|
||||||
|
type TenantManager: TenantManager<Self> + Sized;
|
||||||
|
type Timeline: ArcTimeline<Self> + Sized;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
|
||||||
|
/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
|
||||||
|
/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
|
||||||
|
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
|
||||||
|
struct CacheId(u64);
|
||||||
|
|
||||||
|
impl CacheId {
|
||||||
|
fn next() -> Self {
|
||||||
|
static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
|
||||||
|
let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||||
|
if id == 0 {
|
||||||
|
panic!("CacheId::new() returned 0, overflow");
|
||||||
|
}
|
||||||
|
Self(id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// See module-level comment.
|
||||||
|
pub(crate) struct Cache<T: Types> {
|
||||||
|
id: CacheId,
|
||||||
|
map: Map<T>,
|
||||||
|
}
|
||||||
|
|
||||||
|
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
|
||||||
|
|
||||||
|
impl<T: Types> Default for Cache<T> {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
id: CacheId::next(),
|
||||||
|
map: Default::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
|
||||||
|
pub(crate) struct ShardTimelineId {
|
||||||
|
pub(crate) shard_index: ShardIndex,
|
||||||
|
pub(crate) timeline_id: TimelineId,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// See module-level comment.
|
||||||
|
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
|
||||||
|
struct HandleInner<T: Types> {
|
||||||
|
shut_down: AtomicBool,
|
||||||
|
timeline: T::Timeline,
|
||||||
|
// The timeline's gate held open.
|
||||||
|
_gate_guard: utils::sync::gate::GateGuard,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
|
||||||
|
///
|
||||||
|
/// See module-level comment for details.
|
||||||
|
pub struct PerTimelineState<T: Types> {
|
||||||
|
// None = shutting down
|
||||||
|
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Types> Default for PerTimelineState<T> {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
handles: Mutex::new(Some(Default::default())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Abstract view of [`crate::tenant::mgr`], for testability.
|
||||||
|
pub(crate) trait TenantManager<T: Types> {
|
||||||
|
/// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
|
||||||
|
/// Errors are returned as [`GetError::TenantManager`].
|
||||||
|
async fn resolve(
|
||||||
|
&self,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
shard_selector: ShardSelector,
|
||||||
|
) -> Result<T::Timeline, T::TenantManagerError>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Abstract view of an [`Arc<Timeline>`], for testability.
|
||||||
|
pub(crate) trait ArcTimeline<T: Types>: Clone {
|
||||||
|
fn gate(&self) -> &utils::sync::gate::Gate;
|
||||||
|
fn shard_timeline_id(&self) -> ShardTimelineId;
|
||||||
|
fn get_shard_identity(&self) -> &ShardIdentity;
|
||||||
|
fn per_timeline_state(&self) -> &PerTimelineState<T>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Errors returned by [`Cache::get`].
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub(crate) enum GetError<T: Types> {
|
||||||
|
TenantManager(T::TenantManagerError),
|
||||||
|
TimelineGateClosed,
|
||||||
|
PerTimelineStateShutDown,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Internal type used in [`Cache::get`].
|
||||||
|
enum RoutingResult<T: Types> {
|
||||||
|
FastPath(Handle<T>),
|
||||||
|
SlowPath(ShardTimelineId),
|
||||||
|
NeedConsultTenantManager,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Types> Cache<T> {
|
||||||
|
/// See module-level comment for details.
|
||||||
|
///
|
||||||
|
/// Does NOT check for the shutdown state of [`Types::Timeline`].
|
||||||
|
/// Instead, the methods of [`Types::Timeline`] that are invoked through
|
||||||
|
/// the [`Handle`] are responsible for checking these conditions
|
||||||
|
/// and if so, return an error that causes the page service to
|
||||||
|
/// close the connection.
|
||||||
|
#[instrument(level = "trace", skip_all)]
|
||||||
|
pub(crate) async fn get(
|
||||||
|
&mut self,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
shard_selector: ShardSelector,
|
||||||
|
tenant_manager: &T::TenantManager,
|
||||||
|
) -> Result<Handle<T>, GetError<T>> {
|
||||||
|
// terminates because each iteration removes an element from the map
|
||||||
|
loop {
|
||||||
|
let handle = self
|
||||||
|
.get_impl(timeline_id, shard_selector, tenant_manager)
|
||||||
|
.await?;
|
||||||
|
if handle.0.shut_down.load(Ordering::Relaxed) {
|
||||||
|
let removed = self
|
||||||
|
.map
|
||||||
|
.remove(&handle.0.timeline.shard_timeline_id())
|
||||||
|
.expect("invariant of get_impl is that the returned handle is in the map");
|
||||||
|
assert!(
|
||||||
|
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
|
||||||
|
"shard_timeline_id() incorrect?"
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
return Ok(handle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(level = "trace", skip_all)]
|
||||||
|
async fn get_impl(
|
||||||
|
&mut self,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
shard_selector: ShardSelector,
|
||||||
|
tenant_manager: &T::TenantManager,
|
||||||
|
) -> Result<Handle<T>, GetError<T>> {
|
||||||
|
let miss: ShardSelector = {
|
||||||
|
let routing_state = self.shard_routing(timeline_id, shard_selector);
|
||||||
|
match routing_state {
|
||||||
|
RoutingResult::FastPath(handle) => return Ok(handle),
|
||||||
|
RoutingResult::SlowPath(key) => match self.map.get(&key) {
|
||||||
|
Some(cached) => match cached.upgrade() {
|
||||||
|
Some(upgraded) => return Ok(Handle(upgraded)),
|
||||||
|
None => {
|
||||||
|
trace!("handle cache stale");
|
||||||
|
self.map.remove(&key).unwrap();
|
||||||
|
ShardSelector::Known(key.shard_index)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => ShardSelector::Known(key.shard_index),
|
||||||
|
},
|
||||||
|
RoutingResult::NeedConsultTenantManager => shard_selector,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
self.get_miss(timeline_id, miss, tenant_manager).await
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn shard_routing(
|
||||||
|
&mut self,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
shard_selector: ShardSelector,
|
||||||
|
) -> RoutingResult<T> {
|
||||||
|
loop {
|
||||||
|
// terminates because each iteration removes an element from the map
|
||||||
|
let Some((first_key, first_handle)) = self.map.iter().next() else {
|
||||||
|
return RoutingResult::NeedConsultTenantManager;
|
||||||
|
};
|
||||||
|
let Some(first_handle) = first_handle.upgrade() else {
|
||||||
|
// TODO: dedup with get()
|
||||||
|
trace!("handle cache stale");
|
||||||
|
let first_key_owned = *first_key;
|
||||||
|
self.map.remove(&first_key_owned).unwrap();
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
|
||||||
|
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
|
||||||
|
shard_number: shard_num,
|
||||||
|
shard_count: first_handle_shard_identity.count,
|
||||||
|
};
|
||||||
|
|
||||||
|
let need_idx = match shard_selector {
|
||||||
|
ShardSelector::Page(key) => {
|
||||||
|
make_shard_index(first_handle_shard_identity.get_shard_number(&key))
|
||||||
|
}
|
||||||
|
ShardSelector::Zero => make_shard_index(ShardNumber(0)),
|
||||||
|
ShardSelector::Known(shard_idx) => shard_idx,
|
||||||
|
};
|
||||||
|
let need_shard_timeline_id = ShardTimelineId {
|
||||||
|
shard_index: need_idx,
|
||||||
|
timeline_id,
|
||||||
|
};
|
||||||
|
let first_handle_shard_timeline_id = ShardTimelineId {
|
||||||
|
shard_index: first_handle_shard_identity.shard_index(),
|
||||||
|
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
|
||||||
|
};
|
||||||
|
|
||||||
|
if need_shard_timeline_id == first_handle_shard_timeline_id {
|
||||||
|
return RoutingResult::FastPath(Handle(first_handle));
|
||||||
|
} else {
|
||||||
|
return RoutingResult::SlowPath(need_shard_timeline_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(level = "trace", skip_all)]
|
||||||
|
#[inline(always)]
|
||||||
|
async fn get_miss(
|
||||||
|
&mut self,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
shard_selector: ShardSelector,
|
||||||
|
tenant_manager: &T::TenantManager,
|
||||||
|
) -> Result<Handle<T>, GetError<T>> {
|
||||||
|
match tenant_manager.resolve(timeline_id, shard_selector).await {
|
||||||
|
Ok(timeline) => {
|
||||||
|
let key = timeline.shard_timeline_id();
|
||||||
|
match &shard_selector {
|
||||||
|
ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
|
||||||
|
ShardSelector::Page(_) => (), // gotta trust tenant_manager
|
||||||
|
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
|
||||||
|
}
|
||||||
|
|
||||||
|
let gate_guard = match timeline.gate().enter() {
|
||||||
|
Ok(guard) => guard,
|
||||||
|
Err(_) => {
|
||||||
|
return Err(GetError::TimelineGateClosed);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
trace!("creating new HandleInner");
|
||||||
|
let handle = Arc::new(
|
||||||
|
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
|
||||||
|
// so we can identify reference cycle bugs.
|
||||||
|
HandleInner {
|
||||||
|
shut_down: AtomicBool::new(false),
|
||||||
|
_gate_guard: gate_guard,
|
||||||
|
timeline: timeline.clone(),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
let handle = {
|
||||||
|
let mut lock_guard = timeline
|
||||||
|
.per_timeline_state()
|
||||||
|
.handles
|
||||||
|
.lock()
|
||||||
|
.expect("mutex poisoned");
|
||||||
|
match &mut *lock_guard {
|
||||||
|
Some(per_timeline_state) => {
|
||||||
|
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
|
||||||
|
assert!(replaced.is_none(), "some earlier code left a stale handle");
|
||||||
|
match self.map.entry(key) {
|
||||||
|
hash_map::Entry::Occupied(_o) => {
|
||||||
|
// This cannot happen because
|
||||||
|
// 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
|
||||||
|
// 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
|
||||||
|
// while we were waiting for the tenant manager.
|
||||||
|
unreachable!()
|
||||||
|
}
|
||||||
|
hash_map::Entry::Vacant(v) => {
|
||||||
|
v.insert(Arc::downgrade(&handle));
|
||||||
|
handle
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
return Err(GetError::PerTimelineStateShutDown);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Ok(Handle(handle))
|
||||||
|
}
|
||||||
|
Err(e) => Err(GetError::TenantManager(e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Types> PerTimelineState<T> {
|
||||||
|
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
|
||||||
|
/// to the [`Types::Timeline`] that embeds this per-timeline state.
|
||||||
|
/// Even if [`TenantManager::resolve`] would still resolve to it.
|
||||||
|
///
|
||||||
|
/// Already-alive [`Handle`]s will remain open, usable, and keep the [`ArcTimeline`] alive.
|
||||||
|
/// That's ok because they're short-lived. See module-level comment for details.
|
||||||
|
#[instrument(level = "trace", skip_all)]
|
||||||
|
pub(super) fn shutdown(&self) {
|
||||||
|
let handles = self
|
||||||
|
.handles
|
||||||
|
.lock()
|
||||||
|
.expect("mutex poisoned")
|
||||||
|
// NB: this .take() sets locked to None.
|
||||||
|
// That's what makes future `Cache::get` misses fail.
|
||||||
|
// Cache hits are taken care of below.
|
||||||
|
.take();
|
||||||
|
let Some(handles) = handles else {
|
||||||
|
trace!("already shut down");
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
for handle in handles.values() {
|
||||||
|
// Make hits fail.
|
||||||
|
handle.shut_down.store(true, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
drop(handles);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||||
|
type Target = T::Timeline;
|
||||||
|
fn deref(&self) -> &Self::Target {
|
||||||
|
&self.0.timeline
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
impl<T: Types> Drop for HandleInner<T> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
trace!("HandleInner dropped");
|
||||||
|
}
|
||||||
|
}

// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
impl<T: Types> Drop for Cache<T> {
    fn drop(&mut self) {
        for (_, weak) in self.map.drain() {
            if let Some(strong) = weak.upgrade() {
                // handle is still being kept alive in PerTimelineState
                let timeline = strong.timeline.per_timeline_state();
                let mut handles = timeline.handles.lock().expect("mutex poisoned");
                if let Some(handles) = &mut *handles {
                    let Some(removed) = handles.remove(&self.id) else {
                        // There could have been a shutdown in between us upgrading the weak and locking the mutex.
                        continue;
                    };
                    assert!(Arc::ptr_eq(&removed, &strong));
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use pageserver_api::{
        key::{rel_block_to_key, Key, DBDIR_KEY},
        models::ShardParameters,
        reltag::RelTag,
        shard::ShardStripeSize,
    };
    use utils::shard::ShardCount;

    use super::*;

    const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);

    #[derive(Debug)]
    struct TestTypes;
    impl Types for TestTypes {
        type TenantManagerError = anyhow::Error;
        type TenantManager = StubManager;
        type Timeline = Arc<StubTimeline>;
    }

    struct StubManager {
        shards: Vec<Arc<StubTimeline>>,
    }

    struct StubTimeline {
        gate: utils::sync::gate::Gate,
        id: TimelineId,
        shard: ShardIdentity,
        per_timeline_state: PerTimelineState<TestTypes>,
        myself: Weak<StubTimeline>,
    }

    impl StubTimeline {
        fn getpage(&self) {
            // do nothing
        }
    }

    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
        fn gate(&self) -> &utils::sync::gate::Gate {
            &self.gate
        }

        fn shard_timeline_id(&self) -> ShardTimelineId {
            ShardTimelineId {
                shard_index: self.shard.shard_index(),
                timeline_id: self.id,
            }
        }

        fn get_shard_identity(&self) -> &ShardIdentity {
            &self.shard
        }

        fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
            &self.per_timeline_state
        }
    }

    impl TenantManager<TestTypes> for StubManager {
        async fn resolve(
            &self,
            timeline_id: TimelineId,
            shard_selector: ShardSelector,
        ) -> anyhow::Result<Arc<StubTimeline>> {
            for timeline in &self.shards {
                if timeline.id == timeline_id {
                    match &shard_selector {
                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
                            return Ok(Arc::clone(timeline));
                        }
                        ShardSelector::Zero => continue,
                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
                            return Ok(Arc::clone(timeline));
                        }
                        ShardSelector::Page(_) => continue,
                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
                            return Ok(Arc::clone(timeline));
                        }
                        ShardSelector::Known(_) => continue,
                    }
                }
            }
            anyhow::bail!("not found")
        }
    }

    #[tokio::test(start_paused = true)]
    async fn test_timeline_shutdown() {
        crate::tenant::harness::setup_logging();

        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        let mut cache = Cache::<TestTypes>::default();

        //
        // fill the cache
        //
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (2, 1),
            "strong: shard0, mgr; weak: myself"
        );

        let handle: Handle<_> = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        let handle_inner_weak = Arc::downgrade(&handle.0);
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
        assert_eq!(
            (
                Weak::strong_count(&handle_inner_weak),
                Weak::weak_count(&handle_inner_weak)
            ),
            (2, 2),
            "strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
        );
        assert_eq!(cache.map.len(), 1);

        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (3, 1),
            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
        );
        drop(handle);
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (3, 1),
            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
        );

        //
        // demonstrate that Handle holds up gate closure
        // but shutdown prevents new handles from being handed out
        //

        tokio::select! {
            _ = shard0.gate.close() => {
                panic!("cache and per-timeline handler state keep cache open");
            }
            _ = tokio::time::sleep(FOREVER) => {
                // NB: first poll of close() makes it enter closing state
            }
        }

        let handle = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));

        // SHUTDOWN
        shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown

        assert_eq!(
            1,
            Weak::strong_count(&handle_inner_weak),
            "through local var handle"
        );
        assert_eq!(
            cache.map.len(),
            1,
            "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
        );
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (3, 1),
            "strong: handleinner(via handle), shard0, mgr; weak: myself"
        );

        // this handle is perfectly usable
        handle.getpage();

        cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .err()
            .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
        assert_eq!(
            cache.map.len(),
            0,
            "first access after shutdown cleans up the Weak's from the cache"
        );

        tokio::select! {
            _ = shard0.gate.close() => {
                panic!("handle is keeping gate open");
            }
            _ = tokio::time::sleep(FOREVER) => { }
        }

        drop(handle);
        assert_eq!(
            0,
            Weak::strong_count(&handle_inner_weak),
            "the HandleInner destructor already ran"
        );
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (2, 1),
            "strong: shard0, mgr; weak: myself"
        );

        // closing gate succeeds after dropping handle
        tokio::select! {
            _ = shard0.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("handle is dropped, no other gate holders exist")
            }
        }

        // map gets cleaned on next lookup
        cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .err()
            .expect("documented behavior: can't get new handle after shutdown");
        assert_eq!(cache.map.len(), 0);

        // ensure all refs to shard0 are gone and we're not leaking anything
        let myself = Weak::clone(&shard0.myself);
        drop(shard0);
        drop(mgr);
        assert_eq!(Weak::strong_count(&myself), 0);
    }

    #[tokio::test]
    async fn test_multiple_timelines_and_deletion() {
        crate::tenant::harness::setup_logging();

        let timeline_a = TimelineId::generate();
        let timeline_b = TimelineId::generate();
        assert_ne!(timeline_a, timeline_b);
        let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_a,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_b,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mut mgr = StubManager {
            shards: vec![timeline_a.clone(), timeline_b.clone()],
        };
        let key = DBDIR_KEY;

        let mut cache = Cache::<TestTypes>::default();

        cache
            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have it");
        cache
            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have it");
        assert_eq!(cache.map.len(), 2);

        // delete timeline A
        timeline_a.per_timeline_state.shutdown();
        mgr.shards.retain(|t| t.id != timeline_a.id);
        assert!(
            mgr.resolve(timeline_a.id, ShardSelector::Page(key))
                .await
                .is_err(),
            "broken StubManager implementation"
        );

        assert_eq!(
            cache.map.len(),
            2,
            "cache still has a Weak handle to Timeline A"
        );
        cache
            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
            .await
            .err()
            .expect("documented behavior: can't get new handle after shutdown");
        assert_eq!(cache.map.len(), 1, "next access cleans up the cache");

        cache
            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we still have it");
    }

    fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
        rel_block_to_key(
            RelTag {
                spcnode: 1663,
                dbnode: 208101,
                relnode: 2620,
                forknum: 0,
            },
            shard.0 as u32 * params.stripe_size.0,
        )
    }

    #[tokio::test(start_paused = true)]
    async fn test_shard_split() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let parent = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let child_params = ShardParameters {
            count: ShardCount(2),
            stripe_size: ShardStripeSize::default(),
        };
        let child0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let child1 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let child_shards_by_shard_number = [child0.clone(), child1.clone()];

        let mut cache = Cache::<TestTypes>::default();

        // fill the cache with the parent
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
                    &StubManager {
                        shards: vec![parent.clone()],
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(&handle.myself, &parent.myself),
                "mgr returns parent first"
            );
            drop(handle);
        }

        //
        // SHARD SPLIT: tenant manager changes, but the cache isn't informed
        //

        // while we haven't shut down the parent, the cache will return the cached parent, even
        // if the tenant manager returns the child
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
                    &StubManager {
                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(&handle.myself, &parent.myself),
                "mgr returns parent"
            );
            drop(handle);
        }

        let parent_handle = cache
            .get(
                timeline_id,
                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
                &StubManager {
                    shards: vec![parent.clone()],
                },
            )
            .await
            .expect("we have it");
        assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));

        // invalidate the cache
        parent.per_timeline_state.shutdown();

        // the cache will now return the child, even though the parent handle still exists
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
                    &StubManager {
                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(
                    &handle.myself,
                    &child_shards_by_shard_number[i as usize].myself
                ),
                "mgr returns child"
            );
            drop(handle);
        }

        // all the while the parent handle kept the parent gate open
        tokio::select! {
            _ = parent_handle.gate.close() => {
                panic!("parent handle is keeping gate open");
            }
            _ = tokio::time::sleep(FOREVER) => { }
        }
        drop(parent_handle);
        tokio::select! {
            _ = parent.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("parent handle is dropped, no other gate holders exist")
            }
        }
    }

    #[tokio::test(start_paused = true)]
    async fn test_connection_handler_exit() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        // Simulate 10 connections that are opened, used, and closed
        let mut used_handles = vec![];
        for _ in 0..10 {
            let mut cache = Cache::<TestTypes>::default();
            let handle = {
                let handle = cache
                    .get(timeline_id, ShardSelector::Page(key), &mgr)
                    .await
                    .expect("we have the timeline");
                assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
                handle
            };
            handle.getpage();
            used_handles.push(Arc::downgrade(&handle.0));
        }

        // No handles exist, thus gates are closed and don't require shutdown
        assert!(used_handles
            .iter()
            .all(|weak| Weak::strong_count(weak) == 0));

        // ... thus the gate should close immediately, even without shutdown
        tokio::select! {
            _ = shard0.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("handle is dropped, no other gate holders exist")
            }
        }
    }
}
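The tests above walk the full lifecycle of the cache. As a rough sketch only — reusing the test module's StubManager/TestTypes stubs, with a request loop invented purely for illustration (this is not the page-service code) — the intended per-connection usage is: one Cache per connection, one short-lived Handle per request.

    // Illustrative sketch, not from the diff: the types come from the module and
    // its test stubs above; the loop and error handling are assumptions.
    async fn handle_connection_sketch(
        mut cache: Cache<TestTypes>,
        mgr: &StubManager,
        requests: &[(TimelineId, Key)],
    ) {
        for (timeline_id, key) in requests {
            match cache.get(*timeline_id, ShardSelector::Page(*key), mgr).await {
                Ok(handle) => {
                    // Deref gives access to the Timeline for this one request.
                    handle.getpage();
                    // Dropping the handle here does not close the timeline's gate;
                    // PerTimelineState still holds the HandleInner until shutdown.
                }
                Err(_) => break, // timeline shut down or no longer resolvable
            }
        }
        // Dropping `cache` prunes its entries from each PerTimelineState,
        // breaking the Cache -> HandleInner reference cycle.
    }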
@@ -35,6 +35,10 @@ impl LayerManager {
         self.layer_fmgr.get_from_desc(desc)
     }
 
+    pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
+        self.layer_fmgr.get_from_key(desc)
+    }
+
     /// Get an immutable reference to the layer map.
     ///
     /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
@@ -365,16 +369,20 @@ impl<T> Default for LayerFileManager<T> {
 }
 
 impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
+    fn get_from_key(&self, key: &PersistentLayerKey) -> T {
         // The assumption for the `expect()` is that all code maintains the following invariant:
         // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
         self.0
-            .get(&desc.key())
-            .with_context(|| format!("get layer from desc: {}", desc.layer_name()))
+            .get(key)
+            .with_context(|| format!("get layer from key: {}", key))
             .expect("not found")
             .clone()
     }
 
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
+        self.get_from_key(&desc.key())
+    }
+
     fn contains_key(&self, key: &PersistentLayerKey) -> bool {
         self.0.contains_key(key)
     }
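A minimal usage sketch of the new accessor (the helper function below is hypothetical; `get_from_key`, `Layer`, and `PersistentLayerKey` are the names from the hunk above, and `get_from_desc` now just delegates to the key-based lookup):

    // Hypothetical caller, not from the diff: fetch a layer when only the
    // PersistentLayerKey is at hand, without building a full descriptor.
    fn layer_for_key(layers: &LayerManager, key: &PersistentLayerKey) -> Layer {
        layers.get_from_key(key)
    }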
@@ -241,6 +241,9 @@ impl PostgresRedoManager {
 
     /// Shut down the WAL redo manager.
     ///
+    /// Returns `true` if this call was the one that initiated shutdown.
+    /// `true` may be observed by no caller if the first caller stops polling.
+    ///
     /// After this future completes
     /// - no redo process is running
     /// - no new redo process will be spawned
@@ -250,22 +253,32 @@ impl PostgresRedoManager {
     /// # Cancel-Safety
     ///
     /// This method is cancellation-safe.
-    pub async fn shutdown(&self) {
+    pub async fn shutdown(&self) -> bool {
         // prevent new processes from being spawned
-        let permit = match self.redo_process.get_or_init_detached().await {
+        let maybe_permit = match self.redo_process.get_or_init_detached().await {
             Ok(guard) => {
-                let (proc, permit) = guard.take_and_deinit();
-                drop(proc); // this just drops the Arc, its refcount may not be zero yet
-                permit
+                if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
+                    None
+                } else {
+                    let (proc, permit) = guard.take_and_deinit();
+                    drop(proc); // this just drops the Arc, its refcount may not be zero yet
+                    Some(permit)
+                }
             }
-            Err(permit) => permit,
+            Err(permit) => Some(permit),
         };
-        self.redo_process
-            .set(ProcessOnceCell::ManagerShutDown, permit);
+        let it_was_us = if let Some(permit) = maybe_permit {
+            self.redo_process
+                .set(ProcessOnceCell::ManagerShutDown, permit);
+            true
+        } else {
+            false
+        };
         // wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
         // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
         // for the underlying process.
         self.launched_processes.close().await;
+        it_was_us
     }
 
 /// This type doesn't have its own background task to check for idleness: we
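A hedged sketch of how a caller might use the new return value (the wrapper function and log messages below are assumptions for illustration, not code from this diff): the boolean lets exactly one caller account for having initiated shutdown, while concurrent callers simply wait for it to complete.

    // Illustrative only; assumes the `shutdown(&self) -> bool` signature above.
    async fn shutdown_walredo(manager: &PostgresRedoManager) {
        if manager.shutdown().await {
            tracing::info!("walredo manager shut down by this caller");
        } else {
            tracing::debug!("walredo manager shutdown was already initiated elsewhere");
        }
    }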
7 pageserver/test_data/indices/mixed_workload/README.md Normal file
@@ -0,0 +1,7 @@
+
+# This was captured from one shard of a large tenant in staging.
+
+# It has a mixture of deltas and image layers, >1000 layers in total.
+
+# This is suitable for general smoke tests that want an index which is not
+# trivially small, but doesn't contain weird/pathological cases.
File diff suppressed because one or more lines are too long
153 poetry.lock generated
@@ -870,6 +870,96 @@ files = [
|
|||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clickhouse-connect"
|
||||||
|
version = "0.7.17"
|
||||||
|
description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
|
||||||
|
optional = false
|
||||||
|
python-versions = "~=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
certifi = "*"
|
||||||
|
lz4 = "*"
|
||||||
|
pytz = "*"
|
||||||
|
urllib3 = ">=1.26"
|
||||||
|
zstandard = "*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
arrow = ["pyarrow"]
|
||||||
|
numpy = ["numpy"]
|
||||||
|
orjson = ["orjson"]
|
||||||
|
pandas = ["pandas"]
|
||||||
|
sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
|
||||||
|
tzlocal = ["tzlocal (>=4.0)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "colorama"
|
name = "colorama"
|
||||||
version = "0.4.5"
|
version = "0.4.5"
|
||||||
@@ -1470,6 +1560,56 @@ files = [
|
|||||||
{file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
|
{file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lz4"
|
||||||
|
version = "4.3.3"
|
||||||
|
description = "LZ4 Bindings for Python"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
|
||||||
|
{file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
|
||||||
|
flake8 = ["flake8"]
|
||||||
|
tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "markupsafe"
|
name = "markupsafe"
|
||||||
version = "2.1.1"
|
version = "2.1.1"
|
||||||
@@ -2361,6 +2501,17 @@ files = [
 [package.dependencies]
 six = ">=1.5"
 
+[[package]]
+name = "pytz"
+version = "2024.1"
+description = "World timezone definitions, modern and historical"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
+    {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
+]
+
 [[package]]
 name = "pywin32"
 version = "301"
@@ -3206,4 +3357,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
+content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
@@ -41,6 +41,7 @@ zstandard = "^0.21.0"
 httpx = {extras = ["http2"], version = "^0.26.0"}
 pytest-repeat = "^0.9.3"
 websockets = "^12.0"
+clickhouse-connect = "^0.7.16"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
@@ -21,6 +21,7 @@ pub mod json_ctrl;
 pub mod metrics;
 pub mod patch_control_file;
 pub mod pull_timeline;
+pub mod rate_limit;
 pub mod receive_wal;
 pub mod recovery;
 pub mod remove_wal;
@@ -53,6 +54,7 @@ pub mod defaults {
     pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
     pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
     pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
+    pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2;
 
     // By default, our required residency before eviction is the same as the period that passes
     // before uploading a partial segment, so that in normal operation the eviction can happen
49 safekeeper/src/rate_limit.rs Normal file
@@ -0,0 +1,49 @@
+use std::sync::Arc;
+
+use rand::Rng;
+
+use crate::metrics::MISC_OPERATION_SECONDS;
+
+/// Global rate limiter for background tasks.
+#[derive(Clone)]
+pub struct RateLimiter {
+    partial_backup: Arc<tokio::sync::Semaphore>,
+    eviction: Arc<tokio::sync::Semaphore>,
+}
+
+impl RateLimiter {
+    /// Create a new rate limiter.
+    /// - `partial_backup_max`: maximum number of concurrent partial backups.
+    /// - `eviction_max`: maximum number of concurrent timeline evictions.
+    pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self {
+        Self {
+            partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)),
+            eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)),
+        }
+    }
+
+    /// Get a permit for partial backup. This will block if the maximum number of concurrent
+    /// partial backups is reached.
+    pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit {
+        let _timer = MISC_OPERATION_SECONDS
+            .with_label_values(&["partial_permit_acquire"])
+            .start_timer();
+        self.partial_backup
+            .clone()
+            .acquire_owned()
+            .await
+            .expect("semaphore is closed")
+    }
+
+    /// Try to get a permit for timeline eviction. This will return None if the maximum number of
+    /// concurrent timeline evictions is reached.
+    pub fn try_acquire_eviction(&self) -> Option<tokio::sync::OwnedSemaphorePermit> {
+        self.eviction.clone().try_acquire_owned().ok()
+    }
+}
+
+/// Generate a random duration that is a fraction of the given duration.
+pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration {
+    let randf64 = rand::thread_rng().gen_range(0.0..1.0);
+    duration.mul_f64(randf64)
+}
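Illustrative only — how a background task might use the limiter and the jitter helper above (the task body and call sites are assumptions, not safekeeper code):

    // Sketch: hold a partial-backup permit for the duration of one upload.
    async fn run_partial_backup_sketch(limiter: RateLimiter) {
        // Blocks until one of the `partial_backup_max` permits is free.
        let _permit = limiter.acquire_partial_backup().await;
        // ... upload the partial segment while holding the permit ...
    }

    // Sketch: jitter a retry delay so many timelines don't wake up at once.
    fn jittered_delay(min_resident: std::time::Duration) -> std::time::Duration {
        rand_duration(&min_resident)
    }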
@@ -25,6 +25,7 @@ use utils::{
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 
+use crate::rate_limit::RateLimiter;
 use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{
     AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
@@ -36,7 +37,7 @@ use crate::timeline_guard::ResidenceGuard;
 use crate::timeline_manager::{AtomicStatus, ManagerCtl};
 use crate::timelines_set::TimelinesSet;
 use crate::wal_backup::{self};
-use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter};
+use crate::wal_backup_partial::PartialRemoteSegment;
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
 
 use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
@@ -5,7 +5,6 @@
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use remote_storage::RemotePath;
-use std::time::Instant;
 use tokio::{
     fs::File,
     io::{AsyncRead, AsyncWriteExt},
@@ -15,6 +14,7 @@ use utils::crashsafe::durable_rename;
 
 use crate::{
     metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
+    rate_limit::rand_duration,
     timeline_manager::{Manager, StateSnapshot},
     wal_backup,
     wal_backup_partial::{self, PartialRemoteSegment},
@@ -50,7 +50,6 @@ impl Manager {
             .flush_lsn
             .segment_number(self.wal_seg_size)
             == self.last_removed_segno + 1
-            && self.resident_since.elapsed() >= self.conf.eviction_min_resident
     }
 
     /// Evict the timeline to remote storage.
@@ -112,7 +111,8 @@ impl Manager {
             return;
         }
 
-        self.resident_since = Instant::now();
+        self.evict_not_before =
+            tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
 
         info!("successfully restored evicted timeline");
     }
@@ -23,6 +23,7 @@ use utils::lsn::Lsn;
 use crate::{
     control_file::{FileStorage, Storage},
     metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
+    rate_limit::{rand_duration, RateLimiter},
     recovery::recovery_main,
     remove_wal::calc_horizon_lsn,
     safekeeper::Term,
@@ -32,7 +33,7 @@ use crate::{
     timeline_guard::{AccessService, GuardId, ResidenceGuard},
     timelines_set::{TimelineSetGuard, TimelinesSet},
     wal_backup::{self, WalBackupTaskHandle},
-    wal_backup_partial::{self, PartialRemoteSegment, RateLimiter},
+    wal_backup_partial::{self, PartialRemoteSegment},
     SafeKeeperConf,
 };
 
@@ -185,11 +186,11 @@ pub(crate) struct Manager {
 
     // misc
     pub(crate) access_service: AccessService,
-    pub(crate) partial_backup_rate_limiter: RateLimiter,
+    pub(crate) global_rate_limiter: RateLimiter,
 
     // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not
     // evict them if they go inactive very soon after being restored.
-    pub(crate) resident_since: std::time::Instant,
+    pub(crate) evict_not_before: Instant,
 }
 
 /// This task gets spawned alongside each timeline and is responsible for managing the timeline's
@@ -202,7 +203,7 @@ pub async fn main_task(
     broker_active_set: Arc<TimelinesSet>,
     manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
     mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
-    partial_backup_rate_limiter: RateLimiter,
+    global_rate_limiter: RateLimiter,
 ) {
     tli.set_status(Status::Started);
 
@@ -220,7 +221,7 @@ pub async fn main_task(
         conf,
         broker_active_set,
         manager_tx,
-        partial_backup_rate_limiter,
+        global_rate_limiter,
     )
     .await;
 
@@ -254,9 +255,29 @@ pub async fn main_task(
             mgr.set_status(Status::UpdatePartialBackup);
             mgr.update_partial_backup(&state_snapshot).await;
 
-            if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) {
-                mgr.set_status(Status::EvictTimeline);
-                mgr.evict_timeline().await;
+            let now = Instant::now();
+            if mgr.evict_not_before > now {
+                // we should wait until evict_not_before
+                update_next_event(&mut next_event, mgr.evict_not_before);
+            }
+
+            if mgr.conf.enable_offload
+                && mgr.evict_not_before <= now
+                && mgr.ready_for_eviction(&next_event, &state_snapshot)
+            {
+                // check rate limiter and evict timeline if possible
+                match mgr.global_rate_limiter.try_acquire_eviction() {
+                    Some(_permit) => {
+                        mgr.set_status(Status::EvictTimeline);
+                        mgr.evict_timeline().await;
+                    }
+                    None => {
+                        // we can't evict timeline now, will try again later
+                        mgr.evict_not_before =
+                            Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
+                        update_next_event(&mut next_event, mgr.evict_not_before);
+                    }
+                }
             }
         }
    }
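The hunk above defers eviction until `evict_not_before` and, when no eviction permit is available, pushes the next attempt out by a random fraction of `eviction_min_resident`. A hypothetical helper restating just that backoff step in isolation (not part of the diff):

    // Sketch of the contention backoff used above: jitter the next attempt so
    // timelines that lost the race don't all retry at the same instant.
    fn next_attempt_after_contention(
        min_resident: std::time::Duration,
    ) -> tokio::time::Instant {
        tokio::time::Instant::now() + rand_duration(&min_resident)
    }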
@@ -334,11 +355,10 @@ impl Manager {
         conf: SafeKeeperConf,
         broker_active_set: Arc<TimelinesSet>,
         manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
-        partial_backup_rate_limiter: RateLimiter,
+        global_rate_limiter: RateLimiter,
     ) -> Manager {
         let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
         Manager {
-            conf,
             wal_seg_size: tli.get_wal_seg_size().await,
             walsenders: tli.get_walsenders().clone(),
             state_version_rx: tli.get_state_version_rx(),
@@ -353,8 +373,10 @@ impl Manager {
             partial_backup_uploaded,
             access_service: AccessService::new(manager_tx),
             tli,
-            partial_backup_rate_limiter,
-            resident_since: std::time::Instant::now(),
+            global_rate_limiter,
+            // to smooth out evictions spike after restart
+            evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident),
+            conf,
         }
     }

@@ -541,7 +563,7 @@ impl Manager {
         self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
             self.wal_resident_timeline(),
             self.conf.clone(),
-            self.partial_backup_rate_limiter.clone(),
+            self.global_rate_limiter.clone(),
         )));
     }

@@ -2,10 +2,11 @@
 //! All timelines should always be present in this map, this is done by loading them
 //! all from the disk on startup and keeping them in memory.
 
+use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
+use crate::rate_limit::RateLimiter;
 use crate::safekeeper::ServerInfo;
 use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
 use crate::timelines_set::TimelinesSet;
-use crate::wal_backup_partial::RateLimiter;
 use crate::SafeKeeperConf;
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
@@ -31,7 +32,7 @@ struct GlobalTimelinesState {
     conf: Option<SafeKeeperConf>,
     broker_active_set: Arc<TimelinesSet>,
     load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
-    partial_backup_rate_limiter: RateLimiter,
+    global_rate_limiter: RateLimiter,
 }
 
 // Used to prevent concurrent timeline loading.
@@ -50,7 +51,7 @@ impl GlobalTimelinesState {
         (
             self.get_conf().clone(),
             self.broker_active_set.clone(),
-            self.partial_backup_rate_limiter.clone(),
+            self.global_rate_limiter.clone(),
         )
     }
 
@@ -85,7 +86,7 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
         conf: None,
         broker_active_set: Arc::new(TimelinesSet::default()),
         load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
-        partial_backup_rate_limiter: RateLimiter::new(1),
+        global_rate_limiter: RateLimiter::new(1, 1),
     })
 });
 
@@ -99,7 +100,10 @@ impl GlobalTimelines {
         // lock, so use explicit block
         let tenants_dir = {
             let mut state = TIMELINES_STATE.lock().unwrap();
-            state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency);
+            state.global_rate_limiter = RateLimiter::new(
+                conf.partial_backup_concurrency,
+                DEFAULT_EVICTION_CONCURRENCY,
+            );
             state.conf = Some(conf);
 
             // Iterate through all directories and load tenants for all directories
@@ -18,8 +18,6 @@
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.
 
-use std::sync::Arc;
-
 use camino::Utf8PathBuf;
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
 use remote_storage::RemotePath;
@@ -30,6 +28,7 @@ use utils::lsn::Lsn;
 
 use crate::{
     metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
+    rate_limit::{rand_duration, RateLimiter},
     safekeeper::Term,
     timeline::WalResidentTimeline,
     timeline_manager::StateSnapshot,
@@ -37,30 +36,6 @@ use crate::{
     SafeKeeperConf,
 };
 
-#[derive(Clone)]
-pub struct RateLimiter {
-    semaphore: Arc<tokio::sync::Semaphore>,
-}
-
-impl RateLimiter {
-    pub fn new(permits: usize) -> Self {
-        Self {
-            semaphore: Arc::new(tokio::sync::Semaphore::new(permits)),
-        }
-    }
-
-    async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit {
-        let _timer = MISC_OPERATION_SECONDS
-            .with_label_values(&["partial_permit_acquire"])
-            .start_timer();
-        self.semaphore
-            .clone()
-            .acquire_owned()
-            .await
-            .expect("semaphore is closed")
-    }
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub enum UploadStatus {
     /// Upload is in progress. This status should be used only for garbage collection,
@@ -352,6 +327,7 @@ pub async fn main_task(
 ) -> Option<PartialRemoteSegment> {
     debug!("started");
     let await_duration = conf.partial_backup_timeout;
+    let mut first_iteration = true;
 
     let (_, persistent_state) = tli.get_state().await;
     let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
@@ -419,6 +395,15 @@ pub async fn main_task(
             }
         }
 
+        // smoothing the load after restart, by sleeping for a random time.
+        // if this is not the first iteration, we will wait for the full await_duration
+        let await_duration = if first_iteration {
+            first_iteration = false;
+            rand_duration(&await_duration)
+        } else {
+            await_duration
+        };
+
         // fixing the segno and waiting some time to prevent reuploading the same segment too often
         let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
         let timeout = tokio::time::sleep(await_duration);
@@ -454,7 +439,7 @@ pub async fn main_task(
         }
 
         // limit concurrent uploads
-        let _upload_permit = limiter.acquire_owned().await;
+        let _upload_permit = limiter.acquire_partial_backup().await;
 
         let prepared = backup.prepare_upload().await;
         if let Some(seg) = &uploaded_segment {
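The smoothing logic above relies on rand_duration, imported from the same rate_limit module; its body is not part of this excerpt. A plausible minimal version — assuming it simply returns a uniformly random fraction of the given duration, which is all the call sites here need — is:

use std::time::Duration;
use rand::Rng;

// Sketch only: jitter helper so that many timelines restarted together do not
// all hit their first upload/eviction deadline at the same instant.
pub fn rand_duration(d: &Duration) -> Duration {
    let frac: f64 = rand::thread_rng().gen_range(0.0..=1.0);
    d.mul_f64(frac)
}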
@@ -67,6 +67,7 @@ FALLBACK_DURATION = {
     "test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
     "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
     "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
+    "test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735,
     "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
     "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
     "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
@@ -18,6 +18,7 @@ anyhow.workspace = true
 aws-config.workspace = true
 bytes.workspace = true
 camino.workspace = true
+chrono.workspace = true
 clap.workspace = true
 fail.workspace = true
 futures.workspace = true
@@ -31,6 +32,7 @@ once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
+rand.workspace = true
 reqwest = { workspace = true, features = ["stream"] }
 routerify.workspace = true
 serde.workspace = true
@@ -44,7 +46,12 @@ scopeguard.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 
-diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
+diesel = { version = "2.1.4", features = [
+    "serde_json",
+    "postgres",
+    "r2d2",
+    "chrono",
+] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }
 
@@ -52,4 +59,3 @@ utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
 control_plane = { path = "../control_plane" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
-
@@ -0,0 +1 @@
+DROP TABLE metadata_health;
@@ -0,0 +1,14 @@
+CREATE TABLE metadata_health (
+    tenant_id VARCHAR NOT NULL,
+    shard_number INTEGER NOT NULL,
+    shard_count INTEGER NOT NULL,
+    PRIMARY KEY(tenant_id, shard_number, shard_count),
+    -- Rely on cascade behavior for delete
+    FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE,
+    healthy BOOLEAN NOT NULL DEFAULT TRUE,
+    last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+
+INSERT INTO metadata_health(tenant_id, shard_number, shard_count)
+SELECT tenant_id, shard_number, shard_count FROM tenant_shards;
@@ -10,7 +10,11 @@ use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use metrics::{BuildInfo, NeonMetrics};
-use pageserver_api::controller_api::TenantCreateRequest;
+use pageserver_api::controller_api::{
+    MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
+    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
+    TenantCreateRequest,
+};
 use pageserver_api::models::{
     TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
     TenantTimeTravelRequest, TimelineCreateRequest,
@@ -560,6 +564,51 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
     json_response(StatusCode::ACCEPTED, ())
 }
 
+async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Scrubber)?;
+
+    let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
+    let state = get_state(&req);
+
+    state.service.metadata_health_update(update_req).await?;
+
+    json_response(StatusCode::OK, MetadataHealthUpdateResponse {})
+}
+
+async fn handle_metadata_health_list_unhealthy(
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
+
+    json_response(
+        StatusCode::OK,
+        MetadataHealthListUnhealthyResponse {
+            unhealthy_tenant_shards,
+        },
+    )
+}
+
+async fn handle_metadata_health_list_outdated(
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
+    let state = get_state(&req);
+    let health_records = state
+        .service
+        .metadata_health_list_outdated(list_outdated_req.not_scrubbed_for)
+        .await?;
+
+    json_response(
+        StatusCode::OK,
+        MetadataHealthListOutdatedResponse { health_records },
+    )
+}
+
 async fn handle_tenant_shard_split(
     service: Arc<Service>,
     mut req: Request<Body>,
@@ -987,6 +1036,28 @@ pub fn make_router(
                 RequestName("control_v1_cancel_node_fill"),
             )
         })
+        // Metadata health operations
+        .post("/control/v1/metadata_health/update", |r| {
+            named_request_span(
+                r,
+                handle_metadata_health_update,
+                RequestName("control_v1_metadata_health_update"),
+            )
+        })
+        .get("/control/v1/metadata_health/unhealthy", |r| {
+            named_request_span(
+                r,
+                handle_metadata_health_list_unhealthy,
+                RequestName("control_v1_metadata_health_list_unhealthy"),
+            )
+        })
+        .post("/control/v1/metadata_health/outdated", |r| {
+            named_request_span(
+                r,
+                handle_metadata_health_list_outdated,
+                RequestName("control_v1_metadata_health_list_outdated"),
+            )
+        })
         // TODO(vlad): endpoint for cancelling drain and fill
         // Tenant Shard operations
         .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
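For orientation, the routes registered above can be exercised directly over HTTP. The sketch below is hypothetical (the controller address and token are placeholders, and only the GET endpoint is shown), using the reqwest dependency the controller workspace already has; per the check_permissions calls, the update endpoint expects a scrubber-scoped token and the two list endpoints an admin-scoped one.

// Sketch only: query the new endpoint for tenant shards whose scrubber
// metadata is marked unhealthy. CONTROLLER_URL and ADMIN_JWT are placeholders.
const CONTROLLER_URL: &str = "http://127.0.0.1:1234";
const ADMIN_JWT: &str = "<admin-scoped token>";

async fn list_unhealthy(client: &reqwest::Client) -> anyhow::Result<String> {
    let resp = client
        .get(format!("{CONTROLLER_URL}/control/v1/metadata_health/unhealthy"))
        .bearer_auth(ADMIN_JWT)
        .send()
        .await?
        .error_for_status()?;
    Ok(resp.text().await?)
}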
@@ -9,12 +9,14 @@ use std::time::Duration;
 use storage_controller::http::make_router;
 use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
+use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
     Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
     RECONCILER_CONCURRENCY_DEFAULT,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};
 
@@ -86,6 +88,10 @@ struct Cli {
     // TODO: make `cfg(feature = "testing")`
     #[arg(long)]
     neon_local_repo_dir: Option<PathBuf>,
+
+    /// Chaos testing
+    #[arg(long)]
+    chaos_interval: Option<humantime::Duration>,
 }
 
 enum StrictMode {
@@ -309,6 +315,22 @@ async fn async_main() -> anyhow::Result<()> {
     tracing::info!("Serving on {0}", args.listen);
     let server_task = tokio::task::spawn(server);
 
+    let chaos_task = args.chaos_interval.map(|interval| {
+        let service = service.clone();
+        let cancel = CancellationToken::new();
+        let cancel_bg = cancel.clone();
+        (
+            tokio::task::spawn(
+                async move {
+                    let mut chaos_injector = ChaosInjector::new(service, interval.into());
+                    chaos_injector.run(cancel_bg).await
+                }
+                .instrument(tracing::info_span!("chaos_injector")),
+            ),
+            cancel,
+        )
+    });
+
     // Wait until we receive a signal
     let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
     let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
@@ -337,6 +359,12 @@ async fn async_main() -> anyhow::Result<()> {
         }
     }
 
+    // If we were injecting chaos, stop that so that we're not calling into Service while it shuts down
+    if let Some((chaos_jh, chaos_cancel)) = chaos_task {
+        chaos_cancel.cancel();
+        chaos_jh.await.ok();
+    }
+
     service.shutdown().await;
     tracing::info!("Service shutdown complete");
 
@@ -8,6 +8,7 @@ use self::split_state::SplitState;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
+use pageserver_api::controller_api::MetadataHealthRecord;
 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
@@ -90,6 +91,10 @@ pub(crate) enum DatabaseOperation {
     UpdateTenantShard,
     DeleteTenant,
     UpdateTenantConfig,
+    UpdateMetadataHealth,
+    ListMetadataHealth,
+    ListMetadataHealthUnhealthy,
+    ListMetadataHealthOutdated,
 }
 
 #[must_use]
@@ -307,15 +312,32 @@ impl Persistence {
         &self,
         shards: Vec<TenantShardPersistence>,
     ) -> DatabaseResult<()> {
-        use crate::schema::tenant_shards::dsl::*;
+        use crate::schema::metadata_health;
+        use crate::schema::tenant_shards;
+
+        let now = chrono::Utc::now();
+
+        let metadata_health_records = shards
+            .iter()
+            .map(|t| MetadataHealthPersistence {
+                tenant_id: t.tenant_id.clone(),
+                shard_number: t.shard_number,
+                shard_count: t.shard_count,
+                healthy: true,
+                last_scrubbed_at: now,
+            })
+            .collect::<Vec<_>>();
+
         self.with_measured_conn(
             DatabaseOperation::InsertTenantShards,
             move |conn| -> DatabaseResult<()> {
-                for tenant in &shards {
-                    diesel::insert_into(tenant_shards)
-                        .values(tenant)
-                        .execute(conn)?;
-                }
+                diesel::insert_into(tenant_shards::table)
+                    .values(&shards)
+                    .execute(conn)?;
+
+                diesel::insert_into(metadata_health::table)
+                    .values(&metadata_health_records)
+                    .execute(conn)?;
                 Ok(())
             },
         )
@@ -329,10 +351,10 @@ impl Persistence {
         self.with_measured_conn(
             DatabaseOperation::DeleteTenant,
             move |conn| -> DatabaseResult<()> {
+                // `metadata_health` status (if exists) is also deleted based on the cascade behavior.
                 diesel::delete(tenant_shards)
                     .filter(tenant_id.eq(del_tenant_id.to_string()))
                     .execute(conn)?;
-
                 Ok(())
             },
         )
@@ -675,6 +697,94 @@ impl Persistence {
         )
         .await
     }
+
+    /// Stores all the latest metadata health updates durably. Updates existing entry on conflict.
+    ///
+    /// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller.
+    #[allow(dead_code)]
+    pub(crate) async fn update_metadata_health_records(
+        &self,
+        healthy_records: Vec<MetadataHealthPersistence>,
+        unhealthy_records: Vec<MetadataHealthPersistence>,
+        now: chrono::DateTime<chrono::Utc>,
+    ) -> DatabaseResult<()> {
+        use crate::schema::metadata_health::dsl::*;
+
+        self.with_measured_conn(
+            DatabaseOperation::UpdateMetadataHealth,
+            move |conn| -> DatabaseResult<_> {
+                diesel::insert_into(metadata_health)
+                    .values(&healthy_records)
+                    .on_conflict((tenant_id, shard_number, shard_count))
+                    .do_update()
+                    .set((healthy.eq(true), last_scrubbed_at.eq(now)))
+                    .execute(conn)?;
+
+                diesel::insert_into(metadata_health)
+                    .values(&unhealthy_records)
+                    .on_conflict((tenant_id, shard_number, shard_count))
+                    .do_update()
+                    .set((healthy.eq(false), last_scrubbed_at.eq(now)))
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
+        .await
+    }
+
+    /// Lists all the metadata health records.
+    #[allow(dead_code)]
+    pub(crate) async fn list_metadata_health_records(
+        &self,
+    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
+        self.with_measured_conn(
+            DatabaseOperation::ListMetadataHealth,
+            move |conn| -> DatabaseResult<_> {
+                Ok(
+                    crate::schema::metadata_health::table
+                        .load::<MetadataHealthPersistence>(conn)?,
+                )
+            },
+        )
+        .await
+    }
+
+    /// Lists all the metadata health records that is unhealthy.
+    #[allow(dead_code)]
+    pub(crate) async fn list_unhealthy_metadata_health_records(
+        &self,
+    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
+        use crate::schema::metadata_health::dsl::*;
+        self.with_measured_conn(
+            DatabaseOperation::ListMetadataHealthUnhealthy,
+            move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::metadata_health::table
+                    .filter(healthy.eq(false))
+                    .load::<MetadataHealthPersistence>(conn)?)
+            },
+        )
+        .await
+    }
+
+    /// Lists all the metadata health records that have not been updated since an `earlier` time.
+    #[allow(dead_code)]
+    pub(crate) async fn list_outdated_metadata_health_records(
+        &self,
+        earlier: chrono::DateTime<chrono::Utc>,
+    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
+        use crate::schema::metadata_health::dsl::*;
+
+        self.with_measured_conn(
+            DatabaseOperation::ListMetadataHealthOutdated,
+            move |conn| -> DatabaseResult<_> {
+                let query = metadata_health.filter(last_scrubbed_at.lt(earlier));
+                let res = query.load::<MetadataHealthPersistence>(conn)?;
+
+                Ok(res)
+            },
+        )
+        .await
+    }
 }
 
 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -744,3 +854,59 @@ pub(crate) struct NodePersistence {
     pub(crate) listen_pg_addr: String,
     pub(crate) listen_pg_port: i32,
 }
+
+/// Tenant metadata health status that are stored durably.
+#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
+#[diesel(table_name = crate::schema::metadata_health)]
+pub(crate) struct MetadataHealthPersistence {
+    #[serde(default)]
+    pub(crate) tenant_id: String,
+    #[serde(default)]
+    pub(crate) shard_number: i32,
+    #[serde(default)]
+    pub(crate) shard_count: i32,
+
+    pub(crate) healthy: bool,
+    pub(crate) last_scrubbed_at: chrono::DateTime<chrono::Utc>,
+}
+
+impl MetadataHealthPersistence {
+    pub fn new(
+        tenant_shard_id: TenantShardId,
+        healthy: bool,
+        last_scrubbed_at: chrono::DateTime<chrono::Utc>,
+    ) -> Self {
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_number = tenant_shard_id.shard_number.0 as i32;
+        let shard_count = tenant_shard_id.shard_count.literal() as i32;
+
+        MetadataHealthPersistence {
+            tenant_id,
+            shard_number,
+            shard_count,
+            healthy,
+            last_scrubbed_at,
+        }
+    }
+
+    #[allow(dead_code)]
+    pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
+        Ok(TenantShardId {
+            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
+            shard_number: ShardNumber(self.shard_number as u8),
+            shard_count: ShardCount::new(self.shard_count as u8),
+        })
+    }
+}
+
+impl From<MetadataHealthPersistence> for MetadataHealthRecord {
+    fn from(value: MetadataHealthPersistence) -> Self {
+        MetadataHealthRecord {
+            tenant_shard_id: value
+                .get_tenant_shard_id()
+                .expect("stored tenant id should be valid"),
+            healthy: value.healthy,
+            last_scrubbed_at: value.last_scrubbed_at,
+        }
+    }
+}
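As a quick usage sketch (not part of the patch): the durable row type above is built from a TenantShardId via MetadataHealthPersistence::new and converted back to the API-facing MetadataHealthRecord through the From impl, which is exactly the round-trip the service layer performs. The example assumes the storage controller crate context for the imported types.

// Sketch only: round-trip between the durable row and the API record.
fn to_api_record(shard: TenantShardId) -> MetadataHealthRecord {
    let row = MetadataHealthPersistence::new(shard, /* healthy */ true, chrono::Utc::now());
    MetadataHealthRecord::from(row)
}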
@@ -656,11 +656,8 @@ impl Reconciler {
         // reconcile this location. This includes locations with different configurations, as well
         // as locations with unknown (None) observed state.
 
-        // The general case is to increment the generation. However, there are cases
-        // where this is not necessary:
-        // - if we are only updating the TenantConf part of the location
-        // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
-        //   and the location was already in the correct generation
+        // Incrementing generation is the safe general case, but is inefficient for changes that only
+        // modify some details (e.g. the tenant's config).
         let increment_generation = match observed {
             None => true,
             Some(ObservedStateLocation { conf: None }) => true,
@@ -669,18 +666,11 @@ impl Reconciler {
             }) => {
                 let generations_match = observed.generation == wanted_conf.generation;
 
-                use LocationConfigMode::*;
-                let mode_transition_requires_gen_inc =
-                    match (observed.mode, wanted_conf.mode) {
-                        // Usually the short-lived attachment modes (multi and stale) are only used
-                        // in the case of [`Self::live_migrate`], but it is simple to handle them correctly
-                        // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
-                        (AttachedSingle, AttachedStale) => false,
-                        (AttachedMulti, AttachedSingle) => false,
-                        (lhs, rhs) => lhs != rhs,
-                    };
-
-                !generations_match || mode_transition_requires_gen_inc
+                // We may skip incrementing the generation if the location is already in the expected mode and
+                // generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale),
+                // but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up
+                // after a restart/crash, so fall back to the universally safe path of incrementing generation.
+                !generations_match || (observed.mode != wanted_conf.mode)
             }
         };
 
@@ -1,5 +1,15 @@
 // @generated automatically by Diesel CLI.
 
+diesel::table! {
+    metadata_health (tenant_id, shard_number, shard_count) {
+        tenant_id -> Varchar,
+        shard_number -> Int4,
+        shard_count -> Int4,
+        healthy -> Bool,
+        last_scrubbed_at -> Timestamptz,
+    }
+}
+
 diesel::table! {
     nodes (node_id) {
         node_id -> Int8,
@@ -26,4 +36,4 @@ diesel::table! {
     }
 }
 
-diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
+diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);
@@ -16,7 +16,7 @@ use crate::{
     compute_hook::NotifyError,
     id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
     metrics::LeadershipStatusGroup,
-    persistence::{AbortShardSplitStatus, TenantFilter},
+    persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
     reconciler::{ReconcileError, ReconcileUnits},
     scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
     tenant_shard::{
@@ -33,11 +33,11 @@ use futures::{stream::FuturesUnordered, StreamExt};
 use itertools::Itertools;
 use pageserver_api::{
     controller_api::{
-        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-        ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
-        TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
-        TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
-        TenantShardMigrateResponse, UtilizationScore,
+        MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
+        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
+        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
+        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
+        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
     },
     models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
@@ -84,6 +84,8 @@ use crate::{
 };
 use serde::{Deserialize, Serialize};
 
+pub mod chaos_injector;
+
 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
 
@@ -6095,6 +6097,68 @@ impl Service {
         Ok(())
     }
 
+    /// Updates scrubber metadata health check results.
+    pub(crate) async fn metadata_health_update(
+        &self,
+        update_req: MetadataHealthUpdateRequest,
+    ) -> Result<(), ApiError> {
+        let now = chrono::offset::Utc::now();
+        let (healthy_records, unhealthy_records) = {
+            let locked = self.inner.read().unwrap();
+            let healthy_records = update_req
+                .healthy_tenant_shards
+                .into_iter()
+                // Retain only health records associated with tenant shards managed by storage controller.
+                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
+                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now))
+                .collect();
+            let unhealthy_records = update_req
+                .unhealthy_tenant_shards
+                .into_iter()
+                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
+                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now))
+                .collect();
+
+            (healthy_records, unhealthy_records)
+        };
+
+        self.persistence
+            .update_metadata_health_records(healthy_records, unhealthy_records, now)
+            .await?;
+        Ok(())
+    }
+
+    /// Lists the tenant shards that has unhealthy metadata status.
+    pub(crate) async fn metadata_health_list_unhealthy(
+        &self,
+    ) -> Result<Vec<TenantShardId>, ApiError> {
+        let result = self
+            .persistence
+            .list_unhealthy_metadata_health_records()
+            .await?
+            .iter()
+            .map(|p| p.get_tenant_shard_id().unwrap())
+            .collect();
+
+        Ok(result)
+    }
+
+    /// Lists the tenant shards that have not been scrubbed for some duration.
+    pub(crate) async fn metadata_health_list_outdated(
+        &self,
+        not_scrubbed_for: Duration,
+    ) -> Result<Vec<MetadataHealthRecord>, ApiError> {
+        let earlier = chrono::offset::Utc::now() - not_scrubbed_for;
+        let result = self
+            .persistence
+            .list_outdated_metadata_health_records(earlier)
+            .await?
+            .into_iter()
+            .map(|record| record.into())
+            .collect();
+        Ok(result)
+    }
+
     pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
         self.inner.read().unwrap().get_leadership_status()
     }
storage_controller/src/service/chaos_injector.rs (new file, 71 lines)
@@ -0,0 +1,71 @@
+use std::{sync::Arc, time::Duration};
+
+use rand::seq::SliceRandom;
+use rand::thread_rng;
+use tokio_util::sync::CancellationToken;
+
+use super::Service;
+
+pub struct ChaosInjector {
+    service: Arc<Service>,
+    interval: Duration,
+}
+
+impl ChaosInjector {
+    pub fn new(service: Arc<Service>, interval: Duration) -> Self {
+        Self { service, interval }
+    }
+
+    pub async fn run(&mut self, cancel: CancellationToken) {
+        let mut interval = tokio::time::interval(self.interval);
+
+        loop {
+            tokio::select! {
+                _ = interval.tick() => {}
+                _ = cancel.cancelled() => {
+                    tracing::info!("Shutting down");
+                    return;
+                }
+            }
+
+            self.inject_chaos().await;
+
+            tracing::info!("Chaos iteration...");
+        }
+    }
+
+    async fn inject_chaos(&mut self) {
+        // Pick some shards to interfere with
+        let batch_size = 128;
+        let mut inner = self.service.inner.write().unwrap();
+        let (nodes, tenants, scheduler) = inner.parts_mut();
+        let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
+        let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);
+
+        for victim in victims {
+            let shard = tenants
+                .get_mut(victim)
+                .expect("Held lock between choosing ID and this get");
+
+            // Pick a secondary to promote
+            let Some(new_location) = shard
+                .intent
+                .get_secondary()
+                .choose(&mut thread_rng())
+                .cloned()
+            else {
+                tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
+                continue;
+            };
+
+            let Some(old_location) = *shard.intent.get_attached() else {
+                tracing::info!("Skipping shard {victim}: currently has no attached location");
+                continue;
+            };
+
+            shard.intent.demote_attached(scheduler, old_location);
+            shard.intent.promote_attached(scheduler, new_location);
+            self.service.maybe_reconcile_shard(shard, nodes);
+        }
+    }
+}
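The injector above is only started when the controller binary is launched with the new --chaos-interval flag (see the main.rs hunk earlier), and the shutdown path cancels it before Service::shutdown. As a small, hypothetical sketch of the same contract — running it for a bounded time and stopping it via the CancellationToken; the service handle and durations are placeholders:

use std::{sync::Arc, time::Duration};
use tokio_util::sync::CancellationToken;

// Sketch only: run chaos iterations every `period` until `total` has elapsed.
async fn run_chaos_for(service: Arc<Service>, period: Duration, total: Duration) {
    let cancel = CancellationToken::new();
    let mut injector = ChaosInjector::new(service, period);

    let canceller = cancel.clone();
    let stop = tokio::spawn(async move {
        tokio::time::sleep(total).await;
        canceller.cancel();
    });

    injector.run(cancel).await; // returns once the token is cancelled
    stop.await.ok();
}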
@@ -10,6 +10,7 @@ aws-smithy-async.workspace = true
 either.workspace = true
 tokio-rustls.workspace = true
 anyhow.workspace = true
+git-version.workspace = true
 hex.workspace = true
 humantime.workspace = true
 thiserror.workspace = true
@@ -40,6 +40,11 @@ impl TimelineAnalysis {
             garbage_keys: Vec::new(),
         }
     }
+
+    /// Whether a timeline is healthy.
+    pub(crate) fn is_healthy(&self) -> bool {
+        self.errors.is_empty() && self.warnings.is_empty()
+    }
 }
 
 pub(crate) async fn branch_cleanup_and_check_errors(
@@ -1,10 +1,13 @@
+use std::pin::pin;
+
 use futures::{StreamExt, TryStreamExt};
 use pageserver::tenant::storage_layer::LayerName;
+use remote_storage::ListingMode;
 use serde::{Deserialize, Serialize};
 
 use crate::{
-    checks::parse_layer_object_name, init_remote, list_objects_with_retries,
-    metadata_stream::stream_tenants, BucketConfig, NodeKind,
+    checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
+    stream_objects_with_retries, BucketConfig, NodeKind,
 };
 
 #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -47,45 +50,38 @@ pub async fn find_large_objects(
     ignore_deltas: bool,
     concurrency: usize,
 ) -> anyhow::Result<LargeObjectListing> {
-    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
-    let tenants = std::pin::pin!(stream_tenants(&s3_client, &target));
+    let (remote_client, target) =
+        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let tenants = pin!(stream_tenants_generic(&remote_client, &target));
 
     let objects_stream = tenants.map_ok(|tenant_shard_id| {
         let mut tenant_root = target.tenant_root(&tenant_shard_id);
-        let s3_client = s3_client.clone();
+        let remote_client = remote_client.clone();
         async move {
             let mut objects = Vec::new();
             let mut total_objects_ctr = 0u64;
             // We want the objects and not just common prefixes
             tenant_root.delimiter.clear();
-            let mut continuation_token = None;
-            loop {
-                let fetch_response =
-                    list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone())
-                        .await?;
-                for obj in fetch_response.contents().iter().filter(|o| {
-                    if let Some(obj_size) = o.size {
-                        min_size as i64 <= obj_size
-                    } else {
-                        false
-                    }
-                }) {
-                    let key = obj.key().expect("couldn't get key").to_owned();
+            let mut objects_stream = pin!(stream_objects_with_retries(
+                &remote_client,
+                ListingMode::NoDelimiter,
+                &tenant_root
+            ));
+            while let Some(listing) = objects_stream.next().await {
+                let listing = listing?;
+                for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) {
+                    let key = obj.key.to_string();
                     let kind = LargeObjectKind::from_key(&key);
                     if ignore_deltas && kind == LargeObjectKind::DeltaLayer {
                         continue;
                     }
                     objects.push(LargeObject {
                         key,
-                        size: obj.size.unwrap() as u64,
+                        size: obj.size,
                         kind,
                     })
                 }
-                total_objects_ctr += fetch_response.contents().len() as u64;
-                match fetch_response.next_continuation_token {
-                    Some(new_token) => continuation_token = Some(new_token),
-                    None => break,
-                }
+                total_objects_ctr += listing.keys.len() as u64;
             }
 
             Ok((tenant_shard_id, objects, total_objects_ctr))
@@ -5,6 +5,7 @@
 use std::{
     collections::{HashMap, HashSet},
     sync::Arc,
+    time::Duration,
 };
 
 use anyhow::Context;
@@ -18,8 +19,8 @@ use utils::id::TenantId;
 
 use crate::{
     cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
-    init_remote, init_remote_generic,
-    metadata_stream::{stream_tenant_timelines, stream_tenants},
+    init_remote_generic, list_objects_with_retries_generic,
+    metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
     BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
 };
 
@@ -27,6 +28,11 @@ use crate::{
 enum GarbageReason {
     DeletedInConsole,
     MissingInConsole,
+
+    // The remaining data relates to a known deletion issue, and we're sure that purging this
+    // will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where
+    // there is nothing in a tenant path apart from a heatmap file.
+    KnownBug,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -72,6 +78,15 @@ impl GarbageList {
         }
     }
 
+    /// If an entity has been identified as requiring purge due to a known bug, e.g.
+    /// a particular type of object left behind after an incomplete deletion.
+    fn append_buggy(&mut self, entity: GarbageEntity) {
+        self.items.push(GarbageItem {
+            entity,
+            reason: GarbageReason::KnownBug,
+        });
+    }
+
     /// Return true if appended, false if not. False means the result was not garbage.
     fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool
     where
@@ -138,7 +153,7 @@ async fn find_garbage_inner(
     node_kind: NodeKind,
 ) -> anyhow::Result<GarbageList> {
     // Construct clients for S3 and for Console API
-    let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
+    let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
     let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));
 
     // Build a set of console-known tenants, for quickly eliminating known-active tenants without having
@@ -164,7 +179,7 @@ async fn find_garbage_inner(
 
     // Enumerate Tenants in S3, and check if each one exists in Console
     tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
-    let tenants = stream_tenants(&s3_client, &target);
+    let tenants = stream_tenants_generic(&remote_client, &target);
     let tenants_checked = tenants.map_ok(|t| {
         let api_client = cloud_admin_api_client.clone();
         let console_cache = console_cache.clone();
@@ -219,6 +234,66 @@ async fn find_garbage_inner(
             assert!(project.tenant == tenant_shard_id.tenant_id);
         }
 
+        // Special case: If it's missing in console, check for known bugs that would enable us to conclusively
+        // identify it as purge-able anyway
+        if console_result.is_none() {
+            let timelines =
+                stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
+                    .await?
+                    .collect::<Vec<_>>()
+                    .await;
+            if timelines.is_empty() {
+                // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
+                let tenant_objects = list_objects_with_retries_generic(
+                    &remote_client,
+                    ListingMode::WithDelimiter,
+                    &target.tenant_root(&tenant_shard_id),
+                )
+                .await?;
+                let object = tenant_objects.keys.first().unwrap();
+                if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
+                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
+                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
+                    continue;
+                } else {
+                    tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
+                }
+            } else {
+                // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
+                // rollout of WAL DR in which we never deleted these.
+                let mut any_non_initdb = false;
+
+                for timeline_r in timelines {
+                    let timeline = timeline_r?;
+                    let timeline_objects = list_objects_with_retries_generic(
+                        &remote_client,
+                        ListingMode::WithDelimiter,
+                        &target.timeline_root(&timeline),
+                    )
+                    .await?;
+                    if !timeline_objects.prefixes.is_empty() {
+                        // Sub-paths? Unexpected
+                        any_non_initdb = true;
+                    } else {
+                        let object = timeline_objects.keys.first().unwrap();
+                        if object.key.get_path().as_str().ends_with("initdb.tar.zst") {
+                            tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
+                        } else {
+                            any_non_initdb = true;
+                        }
+                    }
+                }
+
+                if any_non_initdb {
+                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb");
+                } else {
+                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb");
+                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
+                    continue;
+                }
+            }
+        }
+
         if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
             tracing::debug!("Tenant {tenant_shard_id} is garbage");
         } else {
@@ -256,7 +331,8 @@ async fn find_garbage_inner(
 
     // Construct a stream of all timelines within active tenants
     let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
-    let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t));
+    let timelines =
+        active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
     let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
     let timelines = timelines.try_flatten();
 
@@ -349,9 +425,6 @@ pub async fn get_timeline_objects(
     tracing::debug!("Listing objects in timeline {ttid}");
     let timeline_root = super::remote_timeline_path_id(&ttid);
 
-    // TODO: apply extra validation based on object modification time. Don't purge
-    // timelines whose index_part.json has been touched recently.
-
     let list = s3_client
         .list(
             Some(&timeline_root),
@@ -422,6 +495,7 @@ impl DeletionProgressTracker {
 pub async fn purge_garbage(
     input_path: String,
     mode: PurgeMode,
+    min_age: Duration,
     dry_run: bool,
 ) -> anyhow::Result<()> {
     let list_bytes = tokio::fs::read(&input_path).await?;
@@ -432,7 +506,7 @@ pub async fn purge_garbage(
         input_path
     );
 
-    let remote_client =
+    let (remote_client, _target) =
         init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
 
     assert_eq!(
@@ -459,6 +533,7 @@ pub async fn purge_garbage(
         .filter(|i| match (&mode, &i.reason) {
             (PurgeMode::DeletedAndMissing, _) => true,
             (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true,
+            (PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true,
            (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false,
         });
 
@@ -487,6 +562,37 @@ pub async fn purge_garbage(
     let mut progress_tracker = DeletionProgressTracker::default();
     while let Some(result) = get_objects_results.next().await {
         let mut object_list = result?;
+
+        // Extra safety check: even if a collection of objects is garbage, check max() of modification
+        // times before purging, so that if we incorrectly marked a live tenant as garbage then we would
+        // notice that its index has been written recently and would omit deleting it.
+        if object_list.is_empty() {
+            // Simplify subsequent code by ensuring list always has at least one item
+            // Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes
+            continue;
+        }
+        let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap();
+        let age = max_mtime.elapsed();
+        match age {
+            Err(_) => {
+                tracing::warn!("Bad last_modified time");
+                continue;
+            }
+            Ok(a) if a < min_age => {
+                // Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently
+                // written, but out of an abundance of caution we still don't purge it.
+                tracing::info!(
+                    "Skipping tenant with young objects {}..{}",
+                    object_list.first().as_ref().unwrap().key,
+                    object_list.last().as_ref().unwrap().key
+                );
+                continue;
+            }
+            Ok(_) => {
+                // Passed age check
+            }
+        }
+
         objects_to_delete.append(&mut object_list);
         if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
             do_delete(
@@ -16,22 +16,26 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{anyhow, Context};
+use aws_config::retry::{RetryConfigBuilder, RetryMode};
 use aws_sdk_s3::config::Region;
 use aws_sdk_s3::error::DisplayErrorContext;
 use aws_sdk_s3::Client;

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::ValueEnum;
+use futures::{Stream, StreamExt};
 use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{
-    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
-    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+    GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+    S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
 };
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use storage_controller_client::control_api;
 use tokio::io::AsyncReadExt;
+use tokio_util::sync::CancellationToken;
 use tracing::error;
 use tracing_appender::non_blocking::WorkerGuard;
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
@@ -253,6 +257,12 @@ pub struct ControllerClientConfig {
     pub controller_jwt: String,
 }

+impl ControllerClientConfig {
+    pub fn build_client(self) -> control_api::Client {
+        control_api::Client::new(self.controller_api, Some(self.controller_jwt))
+    }
+}
+
 pub struct ConsoleConfig {
     pub token: String,
     pub base_url: Url,
@@ -305,8 +315,15 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
 }

 async fn init_s3_client(bucket_region: Region) -> Client {
+    let mut retry_config_builder = RetryConfigBuilder::new();
+
+    retry_config_builder
+        .set_max_attempts(Some(3))
+        .set_mode(Some(RetryMode::Adaptive));
+
     let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
         .region(bucket_region)
+        .retry_config(retry_config_builder.build())
         .load()
         .await;
     Client::new(&config)
@@ -319,27 +336,35 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
     }
 }

+fn make_root_target(
+    bucket_name: String,
+    prefix_in_bucket: String,
+    node_kind: NodeKind,
+) -> RootTarget {
+    let s3_target = S3Target {
+        bucket_name,
+        prefix_in_bucket,
+        delimiter: "/".to_string(),
+    };
+    match node_kind {
+        NodeKind::Pageserver => RootTarget::Pageserver(s3_target),
+        NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target),
+    }
+}
+
 async fn init_remote(
     bucket_config: BucketConfig,
     node_kind: NodeKind,
 ) -> anyhow::Result<(Arc<Client>, RootTarget)> {
     let bucket_region = Region::new(bucket_config.region);
-    let delimiter = "/".to_string();
     let s3_client = Arc::new(init_s3_client(bucket_region).await);
     let default_prefix = default_prefix_in_bucket(node_kind).to_string();

-    let s3_root = match node_kind {
-        NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
-            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
-            delimiter,
-        }),
-        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
-            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
-            delimiter,
-        }),
-    };
+    let s3_root = make_root_target(
+        bucket_config.bucket,
+        bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
+        node_kind,
+    );

     Ok((s3_client, s3_root))
 }
@@ -347,12 +372,12 @@ async fn init_remote(
 async fn init_remote_generic(
     bucket_config: BucketConfig,
     node_kind: NodeKind,
-) -> anyhow::Result<GenericRemoteStorage> {
+) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
     let endpoint = env::var("AWS_ENDPOINT_URL").ok();
     let default_prefix = default_prefix_in_bucket(node_kind).to_string();
     let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
     let storage = S3Config {
-        bucket_name: bucket_config.bucket,
+        bucket_name: bucket_config.bucket.clone(),
         bucket_region: bucket_config.region,
         prefix_in_bucket,
         endpoint,
@@ -366,7 +391,13 @@ async fn init_remote_generic(
         storage: RemoteStorageKind::AwsS3(storage),
         timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
     };
-    GenericRemoteStorage::from_config(&storage_config).await
+
+    // We already pass the prefix to the remote client above
+    let prefix_in_root_target = String::new();
+    let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
+
+    let client = GenericRemoteStorage::from_config(&storage_config).await?;
+    Ok((client, s3_root))
 }

 async fn list_objects_with_retries(
@@ -404,6 +435,84 @@ async fn list_objects_with_retries(
     Err(anyhow!("unreachable unless MAX_RETRIES==0"))
 }

+/// Listing possibly large amounts of keys in a streaming fashion.
+fn stream_objects_with_retries<'a>(
+    storage_client: &'a GenericRemoteStorage,
+    listing_mode: ListingMode,
+    s3_target: &'a S3Target,
+) -> impl Stream<Item = Result<Listing, anyhow::Error>> + 'a {
+    async_stream::stream! {
+        let mut trial = 0;
+        let cancel = CancellationToken::new();
+        let prefix_str = &s3_target
+            .prefix_in_bucket
+            .strip_prefix("/")
+            .unwrap_or(&s3_target.prefix_in_bucket);
+        let prefix = RemotePath::from_string(prefix_str)?;
+        let mut list_stream =
+            storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
+        while let Some(res) = list_stream.next().await {
+            if let Err(err) = res {
+                let yield_err = if err.is_permanent() {
+                    true
+                } else {
+                    let backoff_time = 1 << trial.max(5);
+                    tokio::time::sleep(Duration::from_secs(backoff_time)).await;
+                    trial += 1;
+                    trial == MAX_RETRIES - 1
+                };
+                if yield_err {
+                    yield Err(err)
+                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
+                    break;
+                }
+            } else {
+                trial = 0;
+                yield res.map_err(anyhow::Error::from);
+            }
+        }
+    }
+}
+
+/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
+/// use [`stream_objects_with_retries`] instead.
+async fn list_objects_with_retries_generic(
+    remote_client: &GenericRemoteStorage,
+    listing_mode: ListingMode,
+    s3_target: &S3Target,
+) -> anyhow::Result<Listing> {
+    let cancel = CancellationToken::new();
+    let prefix_str = &s3_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&s3_target.prefix_in_bucket);
+    let prefix = RemotePath::from_string(prefix_str)?;
+    for trial in 0..MAX_RETRIES {
+        match remote_client
+            .list(Some(&prefix), listing_mode, None, &cancel)
+            .await
+        {
+            Ok(response) => return Ok(response),
+            Err(e) => {
+                if trial == MAX_RETRIES - 1 {
+                    return Err(e)
+                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
+                }
+                error!(
+                    "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
+                    s3_target.bucket_name,
+                    s3_target.prefix_in_bucket,
+                    s3_target.delimiter,
+                    DisplayErrorContext(e),
+                );
+                let backoff_time = 1 << trial.max(5);
+                tokio::time::sleep(Duration::from_secs(backoff_time)).await;
+            }
+        }
+    }
+    panic!("MAX_RETRIES is not allowed to be 0");
+}
+
 async fn download_object_with_retries(
     s3_client: &Client,
     bucket_name: &str,
@@ -1,7 +1,8 @@
 use anyhow::{anyhow, bail};
 use camino::Utf8PathBuf;
+use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
-use reqwest::Url;
+use reqwest::{Method, Url};
 use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use storage_scrubber::pageserver_physical_gc::GcMode;
 use storage_scrubber::scan_pageserver_metadata::scan_metadata;
@@ -16,6 +17,11 @@ use storage_scrubber::{
 use clap::{Parser, Subcommand};
 use utils::id::TenantId;

+use utils::{project_build_tag, project_git_version};
+
+project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);
+
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -50,6 +56,8 @@ enum Command {
         input_path: String,
         #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
         mode: PurgeMode,
+        #[arg(long = "min-age")]
+        min_age: humantime::Duration,
     },
     #[command(verbatim_doc_comment)]
     ScanMetadata {
@@ -59,6 +67,8 @@ enum Command {
         json: bool,
         #[arg(long = "tenant-id", num_args = 0..)]
         tenant_ids: Vec<TenantShardId>,
+        #[arg(long = "post", default_value_t = false)]
+        post_to_storage_controller: bool,
         #[arg(long, default_value = None)]
         /// For safekeeper node_kind only, points to db with debug dump
        dump_db_connstr: Option<String>,
@@ -96,6 +106,8 @@ enum Command {
 async fn main() -> anyhow::Result<()> {
     let cli = Cli::parse();

+    tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
+
     let bucket_config = BucketConfig::from_env()?;

     let command_log_name = match &cli.command {
@@ -114,11 +126,20 @@ async fn main() -> anyhow::Result<()> {
         chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
     ));

+    let controller_client_conf = cli.controller_api.map(|controller_api| {
+        ControllerClientConfig {
+            controller_api,
+            // Default to no key: this is a convenience when working in a development environment
+            controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
+        }
+    });
+
     match cli.command {
         Command::ScanMetadata {
             json,
             tenant_ids,
             node_kind,
+            post_to_storage_controller,
             dump_db_connstr,
             dump_db_table,
         } => {
@@ -157,6 +178,9 @@ async fn main() -> anyhow::Result<()> {
             }
             Ok(())
         } else {
+            if controller_client_conf.is_none() && post_to_storage_controller {
+                return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
+            }
             match scan_metadata(bucket_config.clone(), tenant_ids).await {
                 Err(e) => {
                     tracing::error!("Failed: {e}");
@@ -168,6 +192,21 @@ async fn main() -> anyhow::Result<()> {
                     } else {
                         println!("{}", summary.summary_string());
                     }
+
+                    if post_to_storage_controller {
+                        if let Some(conf) = controller_client_conf {
+                            let controller_client = conf.build_client();
+                            let body = summary.build_health_update_request();
+                            controller_client
+                                .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
+                                    Method::POST,
+                                    "control/v1/metadata_health/update".to_string(),
+                                    Some(body),
+                                )
+                                .await?;
+                        }
+                    }
+
                     if summary.is_fatal() {
                         Err(anyhow::anyhow!("Fatal scrub errors detected"))
                     } else if summary.is_empty() {
@@ -196,9 +235,11 @@ async fn main() -> anyhow::Result<()> {
             let console_config = ConsoleConfig::from_env()?;
             find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
         }
-        Command::PurgeGarbage { input_path, mode } => {
-            purge_garbage(input_path, mode, !cli.delete).await
-        }
+        Command::PurgeGarbage {
+            input_path,
+            mode,
+            min_age,
+        } => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await,
         Command::TenantSnapshot {
             tenant_id,
             output_path,
@@ -213,14 +254,6 @@ async fn main() -> anyhow::Result<()> {
             min_age,
             mode,
         } => {
-            let controller_client_conf = cli.controller_api.map(|controller_api| {
-                ControllerClientConfig {
-                    controller_api,
-                    // Default to no key: this is a convenience when working in a development environment
-                    controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
-                }
-            });
-
             match (&controller_client_conf, mode) {
                 (Some(_), _) => {
                     // Any mode may run when controller API is set
@@ -1,12 +1,41 @@
-use anyhow::Context;
+use std::str::FromStr;
+
+use anyhow::{anyhow, Context};
 use async_stream::{stream, try_stream};
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
+use futures::StreamExt;
+use remote_storage::{GenericRemoteStorage, ListingMode};
 use tokio_stream::Stream;

-use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
+use crate::{
+    list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
+    TenantShardTimelineId,
+};
 use pageserver_api::shard::TenantShardId;
 use utils::id::{TenantId, TimelineId};

+/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
+pub fn stream_tenants_generic<'a>(
+    remote_client: &'a GenericRemoteStorage,
+    target: &'a RootTarget,
+) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
+    try_stream! {
+        let tenants_target = target.tenants_root();
+        let mut tenants_stream =
+            std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target));
+        while let Some(chunk) = tenants_stream.next().await {
+            let chunk = chunk?;
+            let entry_ids = chunk.prefixes.iter()
+                .map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'")));
+            for dir_name_res in entry_ids {
+                let dir_name = dir_name_res?;
+                let id = TenantShardId::from_str(dir_name)?;
+                yield id;
+            }
+        }
+    }
+}
+
 /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
 pub fn stream_tenants<'a>(
     s3_client: &'a Client,
@@ -160,6 +189,63 @@ pub async fn stream_tenant_timelines<'a>(
     })
 }

+/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
+/// using a listing. The listing is done before the stream is built, so that this
+/// function can be used to generate concurrency on a stream using buffer_unordered.
+pub async fn stream_tenant_timelines_generic<'a>(
+    remote_client: &'a GenericRemoteStorage,
+    target: &'a RootTarget,
+    tenant: TenantShardId,
+) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
+    let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
+    let timelines_target = target.timelines_root(&tenant);
+
+    let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
+        remote_client,
+        ListingMode::WithDelimiter,
+        &timelines_target
+    ));
+    loop {
+        tracing::debug!("Listing in {tenant}");
+        let fetch_response = match objects_stream.next().await {
+            None => break,
+            Some(Err(e)) => {
+                timeline_ids.push(Err(e));
+                break;
+            }
+            Some(Ok(r)) => r,
+        };
+
+        let new_entry_ids = fetch_response
+            .prefixes
+            .iter()
+            .filter_map(|prefix| -> Option<&str> {
+                prefix
+                    .get_path()
+                    .as_str()
+                    .strip_prefix(&timelines_target.prefix_in_bucket)?
+                    .strip_suffix('/')
+            })
+            .map(|entry_id_str| {
+                entry_id_str
+                    .parse::<TimelineId>()
+                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
+            });
+
+        for i in new_entry_ids {
+            timeline_ids.push(i);
+        }
+    }
+
+    tracing::debug!("Yielding for {}", tenant);
+    Ok(stream! {
+        for i in timeline_ids {
+            let id = i?;
+            yield Ok(TenantShardTimelineId::new(tenant, id));
+        }
+    })
+}
+
 pub(crate) fn stream_listing<'a>(
     s3_client: &'a Client,
     target: &'a S3Target,
@@ -567,13 +567,7 @@ pub async fn pageserver_physical_gc(
     }

     // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
-    let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
-        let ControllerClientConfig {
-            controller_api,
-            controller_jwt,
-        } = c;
-        control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
-    }) else {
+    let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
         tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
         return Ok(summary);
     };
@@ -9,12 +9,13 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline
 use aws_sdk_s3::Client;
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
+use pageserver_api::controller_api::MetadataHealthUpdateRequest;
 use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use utils::id::TenantId;
 use utils::shard::ShardCount;

-#[derive(Serialize)]
+#[derive(Serialize, Default)]
 pub struct MetadataSummary {
     tenant_count: usize,
     timeline_count: usize,
@@ -23,19 +24,16 @@ pub struct MetadataSummary {
     with_warnings: HashSet<TenantShardTimelineId>,
     with_orphans: HashSet<TenantShardTimelineId>,
     indices_by_version: HashMap<usize, usize>,
+
+    #[serde(skip)]
+    pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
+    #[serde(skip)]
+    pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
 }

 impl MetadataSummary {
     fn new() -> Self {
-        Self {
-            tenant_count: 0,
-            timeline_count: 0,
-            timeline_shard_count: 0,
-            with_errors: HashSet::new(),
-            with_warnings: HashSet::new(),
-            with_orphans: HashSet::new(),
-            indices_by_version: HashMap::new(),
-        }
+        Self::default()
     }

     fn update_data(&mut self, data: &S3TimelineBlobData) {
@@ -54,6 +52,13 @@ impl MetadataSummary {
     }

     fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
+        if analysis.is_healthy() {
+            self.healthy_tenant_shards.insert(id.tenant_shard_id);
+        } else {
+            self.healthy_tenant_shards.remove(&id.tenant_shard_id);
+            self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
+        }
+
         if !analysis.errors.is_empty() {
             self.with_errors.insert(*id);
         }
@@ -101,6 +106,13 @@ Index versions: {version_summary}
     pub fn is_empty(&self) -> bool {
         self.timeline_shard_count == 0
     }
+
+    pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
+        MetadataHealthUpdateRequest {
+            healthy_tenant_shards: self.healthy_tenant_shards.clone(),
+            unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
+        }
+    }
 }

 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
@@ -150,6 +150,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_pitr_history_size",
     "pageserver_layer_bytes",
     "pageserver_layer_count",
+    "pageserver_visible_physical_size",
     "pageserver_storage_operations_seconds_count_total",
     "pageserver_storage_operations_seconds_sum_total",
     "pageserver_evictions_total",
@@ -449,6 +449,7 @@ class TokenScope(str, Enum):
     GENERATIONS_API = "generations_api"
     SAFEKEEPER_DATA = "safekeeperdata"
     TENANT = "tenant"
+    SCRUBBER = "scrubber"


 class NeonEnvBuilder:
@@ -1942,11 +1943,15 @@ class NeonCli(AbstractNeonCli):
         remote_ext_config: Optional[str] = None,
         pageserver_id: Optional[int] = None,
         allow_multiple=False,
+        basebackup_request_tries: Optional[int] = None,
     ) -> "subprocess.CompletedProcess[str]":
         args = [
             "endpoint",
             "start",
         ]
+        extra_env_vars = {}
+        if basebackup_request_tries is not None:
+            extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries)
         if remote_ext_config is not None:
             args.extend(["--remote-ext-config", remote_ext_config])

@@ -1959,7 +1964,7 @@ class NeonCli(AbstractNeonCli):
         if allow_multiple:
             args.extend(["--allow-multiple"])

-        res = self.raw_cli(args)
+        res = self.raw_cli(args, extra_env_vars)
         res.check_returncode()
         return res

@@ -2586,6 +2591,51 @@ class NeonStorageController(MetricsGetter, LogUtils):

             time.sleep(backoff)

+    def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]):
+        body: Dict[str, Any] = {
+            "healthy_tenant_shards": [str(t) for t in healthy],
+            "unhealthy_tenant_shards": [str(t) for t in unhealthy],
+        }
+
+        self.request(
+            "POST",
+            f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
+            json=body,
+            headers=self.headers(TokenScope.SCRUBBER),
+        )
+
+    def metadata_health_list_unhealthy(self):
+        response = self.request(
+            "GET",
+            f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def metadata_health_list_outdated(self, duration: str):
+        body: Dict[str, Any] = {"not_scrubbed_for": duration}
+
+        response = self.request(
+            "POST",
+            f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
+            json=body,
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool:
+        """Metadata is healthy if there is no unhealthy or outdated health records."""
+
+        unhealthy = self.metadata_health_list_unhealthy()
+        outdated = self.metadata_health_list_outdated(outdated_duration)
+
+        healthy = (
+            len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0
+        )
+        if not healthy:
+            log.info(f"{unhealthy=}, {outdated=}")
+        return healthy
+
     def step_down(self):
         log.info("Asking storage controller to step down")
         response = self.request(
@@ -3766,6 +3816,7 @@ class Endpoint(PgProtocol, LogUtils):
         pageserver_id: Optional[int] = None,
         safekeepers: Optional[List[int]] = None,
         allow_multiple: bool = False,
+        basebackup_request_tries: Optional[int] = None,
     ) -> "Endpoint":
         """
         Start the Postgres instance.
@@ -3787,6 +3838,7 @@ class Endpoint(PgProtocol, LogUtils):
             remote_ext_config=remote_ext_config,
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
+            basebackup_request_tries=basebackup_request_tries,
         )
         self._running.release(1)

@@ -3933,6 +3985,7 @@ class Endpoint(PgProtocol, LogUtils):
         remote_ext_config: Optional[str] = None,
         pageserver_id: Optional[int] = None,
         allow_multiple=False,
+        basebackup_request_tries: Optional[int] = None,
     ) -> "Endpoint":
         """
         Create an endpoint, apply config, and start Postgres.
@@ -3953,6 +4006,7 @@ class Endpoint(PgProtocol, LogUtils):
             remote_ext_config=remote_ext_config,
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
+            basebackup_request_tries=basebackup_request_tries,
        )

         log.info(f"Postgres startup took {time.time() - started_at} seconds")
@@ -3996,6 +4050,7 @@ class EndpointFactory:
         config_lines: Optional[List[str]] = None,
         remote_ext_config: Optional[str] = None,
         pageserver_id: Optional[int] = None,
+        basebackup_request_tries: Optional[int] = None,
     ) -> Endpoint:
         ep = Endpoint(
             self.env,
@@ -4014,6 +4069,7 @@ class EndpointFactory:
             lsn=lsn,
             remote_ext_config=remote_ext_config,
             pageserver_id=pageserver_id,
+            basebackup_request_tries=basebackup_request_tries,
         )

     def create(
@@ -4355,10 +4411,11 @@ class StorageScrubber:
         assert stdout is not None
         return stdout

-    def scan_metadata(self) -> Any:
-        stdout = self.scrubber_cli(
-            ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
-        )
+    def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
+        args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
+        if post_to_storage_controller:
+            args.append("--post")
+        stdout = self.scrubber_cli(args, timeout=30)

         try:
             return json.loads(stdout)
@@ -4482,6 +4539,13 @@ def test_output_dir(

     yield test_dir

+    # Allure artifacts creation might involve the creation of `.tar.zst` archives,
+    # which aren't going to be used if Allure results collection is not enabled
+    # (i.e. --alluredir is not set).
+    # Skip `allure_attach_from_dir` in this case
+    if not request.config.getoption("--alluredir"):
+        return
+
     preserve_database_files = False
     for k, v in request.node.user_properties:
         # NB: the neon_env_builder fixture uses this fixture (test_output_dir).
@@ -663,6 +663,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         force_image_layer_creation=False,
         wait_until_uploaded=False,
         compact: Optional[bool] = None,
+        **kwargs,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
@@ -680,6 +681,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
             params=query,
+            **kwargs,
         )
         log.info(f"Got checkpoint request response code: {res.status_code}")
         self.verbose_error(res)
test_runner/logical_repl/test_log_repl.py (new file, 88 lines)
@@ -0,0 +1,88 @@
+"""
+Test the logical replication in Neon with the different consumers
+"""
+
+import hashlib
+import time
+
+import clickhouse_connect
+import psycopg2
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import RemotePostgres
+from fixtures.utils import wait_until
+
+
+def query_clickhouse(
+    client,
+    query: str,
+    digest: str,
+) -> None:
+    """
+    Run the query on the client
+    return answer if successful, raise an exception otherwise
+    """
+    log.debug("Query: %s", query)
+    res = client.query(query)
+    log.debug(res.result_rows)
+    m = hashlib.sha1()
+    m.update(repr(tuple(res.result_rows)).encode())
+    hash_res = m.hexdigest()
+    log.debug("Hash: %s", hash_res)
+    if hash_res == digest:
+        return
+    raise ValueError("Hash mismatch")
+
+
+@pytest.mark.remote_cluster
+def test_clickhouse(remote_pg: RemotePostgres):
+    """
+    Test the logical replication having ClickHouse as a client
+    """
+    conn_options = remote_pg.conn_options()
+    for _ in range(5):
+        try:
+            conn = psycopg2.connect(remote_pg.connstr())
+        except psycopg2.OperationalError as perr:
+            log.debug(perr)
+            time.sleep(1)
+        else:
+            break
+        raise TimeoutError
+    cur = conn.cursor()
+    cur.execute("DROP TABLE IF EXISTS table1")
+    cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
+    cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
+    conn.commit()
+    client = clickhouse_connect.get_client(host="clickhouse")
+    client.command("SET allow_experimental_database_materialized_postgresql=1")
+    client.command(
+        "CREATE DATABASE db1_postgres ENGINE = "
+        f"MaterializedPostgreSQL('{conn_options['host']}', "
+        f"'{conn_options['dbname']}', "
+        f"'{conn_options['user']}', '{conn_options['password']}') "
+        "SETTINGS materialized_postgresql_tables_list = 'table1';"
+    )
+    wait_until(
+        120,
+        0.5,
+        lambda: query_clickhouse(
+            client,
+            "select * from db1_postgres.table1 order by 1",
+            "ee600d8f7cd05bd0b169fa81f44300a9dd10085a",
+        ),
+    )
+    cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');")
+    conn.commit()
+    wait_until(
+        120,
+        0.5,
+        lambda: query_clickhouse(
+            client,
+            "select * from db1_postgres.table1 order by 1",
+            "9eba2daaf7e4d7d27ac849525f68b562ab53947d",
+        ),
+    )
+    log.debug("Sleeping before final checking if Neon is still alive")
+    time.sleep(3)
+    cur.execute("SELECT 1")
@@ -6,21 +6,8 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder


-@pytest.mark.timeout(10000)
-def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
-    """
-    Test that GC is able to collect all old layers even if them are forming
-    "stairs" and there are not three delta layers since last image layer.
-
-    Information about image layers needed to collect old layers should
-    be propagated by GC to compaction task which should take in in account
-    when make a decision which new image layers needs to be created.
-
-    NB: this test demonstrates the problem. The source tree contained the
-    `gc_feedback` mechanism for about 9 months, but, there were problems
-    with it and it wasn't enabled at runtime.
-    This PR removed the code: https://github.com/neondatabase/neon/pull/6863
-    """
+def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str):
+    assert mode == "normal" or mode == "with_snapshots"
     env = neon_env_builder.init_start()
     client = env.pageserver.http_client()

@@ -74,6 +61,9 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma

         physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
         log.info(f"Physical storage size {physical_size}")
+        if mode == "with_snapshots":
+            if step == n_steps / 2:
+                env.neon_cli.create_branch("child")

     max_num_of_deltas_above_image = 0
     max_total_num_of_deltas = 0
@@ -149,3 +139,37 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
         log.info(f"Writing layer map to {layer_map_path}")
         with layer_map_path.open("w") as f:
             f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id)))
+
+
+@pytest.mark.timeout(10000)
+def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    """
+    Test that GC is able to collect all old layers even if them are forming
+    "stairs" and there are not three delta layers since last image layer.
+
+    Information about image layers needed to collect old layers should
+    be propagated by GC to compaction task which should take in in account
+    when make a decision which new image layers needs to be created.
+
+    NB: this test demonstrates the problem. The source tree contained the
+    `gc_feedback` mechanism for about 9 months, but, there were problems
+    with it and it wasn't enabled at runtime.
+    This PR removed the code: https://github.com/neondatabase/neon/pull/6863
+
+    And the bottom-most GC-compaction epic resolves the problem.
+    https://github.com/neondatabase/neon/issues/8002
+    """
+    gc_feedback_impl(neon_env_builder, zenbenchmark, "normal")
+
+
+@pytest.mark.timeout(10000)
+def test_gc_feedback_with_snapshots(
+    neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
+):
+    """
+    Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle
+    of the benchmark, and the bottom-most compaction should collect as much garbage as possible below the GC
+    horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point,
+    and images covering the full key range (in a delta layer) at the GC horizon.
+    """
+    gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots")
@@ -18,7 +18,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active
 from fixtures.utils import query_scalar
 from performance.test_perf_pgbench import get_scales_matrix
 from requests import RequestException
-from requests.exceptions import RetryError


 # Test branch creation
@@ -151,7 +150,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
     env.pageserver.allowed_errors.extend(
         [
             ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
-            ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading",
+            ".*page_service_conn_main.*: query handler for 'basebackup .* ERROR: Not found: Timeline",
         ]
     )
     ps_http = env.pageserver.http_client()
@@ -176,10 +175,12 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE

         env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline)

-        with pytest.raises(RuntimeError, match="is not active, state: Loading"):
-            env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant)
+        with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"):
+            env.endpoints.create_start(
+                initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2
+            )
+        ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
     finally:
-        # FIXME: paused uploads bother shutdown
         env.pageserver.stop(immediate=True)

         t.join()
@@ -193,8 +194,11 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
     env = neon_env_builder.init_configs()
     env.start()

-    env.pageserver.allowed_errors.append(
-        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
+            ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: .*Cannot branch off the timeline that's not present in pageserver.*",
+        ]
     )
     ps_http = env.pageserver.http_client()

@@ -216,7 +220,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder

     branch_id = TimelineId.generate()

-    with pytest.raises(RetryError, match="too many 503 error responses"):
+    with pytest.raises(
+        PageserverApiException,
+        match="Cannot branch off the timeline that's not present in pageserver",
+    ):
         ps_http.timeline_create(
             env.pg_version,
             env.initial_tenant,
@@ -389,6 +396,11 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
         repeat_result = ps_http.timeline_create(
             env.pg_version, env.initial_tenant, success_timeline, timeout=60
         )
+        # remote_consistent_lsn_visible will be published only after we've
+        # confirmed the generation, which is not part of what we await during
+        # timeline creation (uploads). mask it out here to avoid flakyness.
+        del success_result["remote_consistent_lsn_visible"]
+        del repeat_result["remote_consistent_lsn_visible"]
         assert repeat_result == success_result
     finally:
         env.pageserver.stop(immediate=True)
@@ -3,18 +3,15 @@ import re
|
|||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import toml
|
import toml
|
||||||
from fixtures.common_types import Lsn
|
from fixtures.common_types import Lsn, TenantId, TimelineId
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
from fixtures.neon_fixtures import (
|
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin
|
||||||
NeonEnv,
|
|
||||||
NeonEnvBuilder,
|
|
||||||
PgBin,
|
|
||||||
)
|
|
||||||
from fixtures.pageserver.http import PageserverApiException
|
from fixtures.pageserver.http import PageserverApiException
|
||||||
from fixtures.pageserver.utils import (
|
from fixtures.pageserver.utils import (
|
||||||
timeline_delete_wait_completed,
|
timeline_delete_wait_completed,
|
||||||
@@ -22,7 +19,8 @@ from fixtures.pageserver.utils import (
|
|||||||
wait_for_upload,
|
wait_for_upload,
|
||||||
)
|
)
|
||||||
from fixtures.pg_version import PgVersion
|
from fixtures.pg_version import PgVersion
|
||||||
from fixtures.remote_storage import RemoteStorageKind
|
from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
|
||||||
|
from fixtures.workload import Workload
|
||||||
|
|
||||||
#
|
#
|
||||||
# A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases.
|
# A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases.
|
||||||
@@ -409,3 +407,133 @@ def dump_differs(
|
|||||||
break
|
break
|
||||||
|
|
||||||
return differs
|
return differs
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class HistoricDataSet:
|
||||||
|
name: str
|
||||||
|
tenant_id: TenantId
|
||||||
|
pg_version: PgVersion
|
||||||
|
url: str
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return self.name
|
||||||
|
|
||||||
|
|
||||||
|
HISTORIC_DATA_SETS = [
|
||||||
|
# From before we enabled image layer compression.
|
||||||
|
# - IndexPart::LATEST_VERSION 7
|
||||||
|
# - STORAGE_FORMAT_VERSION 3
|
||||||
|
HistoricDataSet(
|
||||||
|
"2024-07-18",
|
||||||
|
TenantId("17bf64a53509714687664b3a84e9b3ba"),
|
||||||
|
PgVersion.V16,
|
||||||
|
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("dataset", HISTORIC_DATA_SETS)
|
||||||
|
@pytest.mark.xdist_group("compatibility")
|
||||||
|
def test_historic_storage_formats(
|
||||||
|
neon_env_builder: NeonEnvBuilder,
|
||||||
|
test_output_dir: Path,
|
||||||
|
pg_version: PgVersion,
|
||||||
|
dataset: HistoricDataSet,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
This test is like test_backward_compatibility, but it looks back further to examples of our storage format from long ago.
|
||||||
|
"""
|
||||||
|
|
||||||
|
ARTIFACT_CACHE_DIR = "./artifact_cache"
|
||||||
|
|
||||||
|
import tarfile
|
||||||
|
from contextlib import closing
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import zstandard
|
||||||
|
|
||||||
|
artifact_unpack_path = ARTIFACT_CACHE_DIR / Path("unpacked") / Path(dataset.name)
|
||||||
|
|
||||||
|
# Note: we assume that when running across a matrix of PG versions, the matrix includes all the versions needed by
|
||||||
|
# HISTORIC_DATA_SETS. If we ever remove a PG version from the matrix, then historic datasets built using that version
|
||||||
|
# will no longer be covered by this test.
|
||||||
|
if pg_version != dataset.pg_version:
|
||||||
|
pytest.skip(f"Dataset {dataset} is for different PG version, skipping")
|
||||||
|
|
||||||
|
with closing(requests.get(dataset.url, stream=True)) as r:
|
||||||
|
unzstd = zstandard.ZstdDecompressor()
|
||||||
|
with unzstd.stream_reader(r.raw) as stream:
|
||||||
|
with tarfile.open(mode="r|", fileobj=stream) as tf:
|
||||||
|
tf.extractall(artifact_unpack_path)
|
||||||
|
|
||||||
|
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||||
|
neon_env_builder.pg_version = dataset.pg_version
|
||||||
|
env = neon_env_builder.init_configs()
|
||||||
|
env.start()
|
||||||
|
assert isinstance(env.pageserver_remote_storage, S3Storage)
|
||||||
|
|
||||||
|
# Link artifact data into test's remote storage. We don't want the whole repo dir, just the remote storage part: we are not testing
|
||||||
|
# compat of local disk data across releases (test_backward_compat does that), we're testing really long-lived data in S3 like layer files and indices.
|
||||||
|
#
|
||||||
|
# The code generating the snapshot uses local_fs, but this test uses S3Storage, so we are copying a tree of files into a bucket. We use
|
||||||
|
# S3Storage so that the scrubber can run (the scrubber doesn't speak local_fs)
|
||||||
|
artifact_pageserver_path = (
|
||||||
|
artifact_unpack_path / Path("repo") / Path("local_fs_remote_storage") / Path("pageserver")
|
||||||
|
)
|
||||||
|
for root, _dirs, files in os.walk(artifact_pageserver_path):
|
||||||
|
for file in files:
|
||||||
|
local_path = os.path.join(root, file)
|
||||||
|
remote_key = (
|
||||||
|
env.pageserver_remote_storage.prefix_in_bucket
|
||||||
|
+ str(local_path)[len(str(artifact_pageserver_path)) :]
|
||||||
|
)
|
||||||
|
log.info(f"Uploading {local_path} -> {remote_key}")
|
||||||
|
env.pageserver_remote_storage.client.upload_file(
|
||||||
|
local_path, env.pageserver_remote_storage.bucket_name, remote_key
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt)
|
||||||
|
#
|
||||||
|
# Do this _before_ importing to the pageserver, as that import may start writing immediately
|
||||||
|
metadata_summary = env.storage_scrubber.scan_metadata()
|
||||||
|
assert metadata_summary["tenant_count"] >= 1
|
||||||
|
assert metadata_summary["timeline_count"] >= 1
|
||||||
|
assert not metadata_summary["with_errors"]
|
||||||
|
assert not metadata_summary["with_warnings"]
|
||||||
|
|
||||||
|
env.neon_cli.import_tenant(dataset.tenant_id)
|
||||||
|
|
||||||
|
# Discover timelines
|
||||||
|
timelines = env.pageserver.http_client().timeline_list(dataset.tenant_id)
|
||||||
|
# All our artifacts should contain at least one timeline
|
||||||
|
assert len(timelines) > 0
|
||||||
|
|
||||||
|
# TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very
|
||||||
|
# least they should include a mixture of deltas and image layers. Preferably they should also
|
||||||
|
# contain some "exotic" stuff like aux files from logical replication.
|
||||||
|
|
||||||
|
# Check we can start an endpoint and read the SQL that the artifact is meant to contain
|
||||||
|
reference_sql_dump = artifact_unpack_path / Path("dump.sql")
|
||||||
|
ep = env.endpoints.create_start("main", tenant_id=dataset.tenant_id)
|
||||||
|
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
|
||||||
|
pg_bin.run_capture(
|
||||||
|
["pg_dumpall", f"--dbname={ep.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]
|
||||||
|
)
|
||||||
|
assert not dump_differs(
|
||||||
|
reference_sql_dump,
|
||||||
|
test_output_dir / "dump.sql",
|
||||||
|
test_output_dir / "dump.filediff",
|
||||||
|
)
|
||||||
|
ep.stop()
|
||||||
|
|
||||||
|
# Check we can also do writes to the database
|
||||||
|
existing_timeline_id = TimelineId(timelines[0]["timeline_id"])
|
||||||
|
workload = Workload(env, dataset.tenant_id, existing_timeline_id)
|
||||||
|
workload.init()
|
||||||
|
workload.write_rows(100)
|
||||||
|
|
||||||
|
# Check that compaction works
|
||||||
|
env.pageserver.http_client().timeline_compact(
|
||||||
|
dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True
|
||||||
|
)
|
||||||
|
@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
@@ -313,6 +312,7 @@ def test_remote_storage_upload_queue_retries(

     def churn_while_failpoints_active(result):
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c")
+        # this call will wait for the failpoints to be turned off
         client.timeline_checkpoint(tenant_id, timeline_id)
         client.timeline_compact(tenant_id, timeline_id)
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d")
@@ -332,8 +332,8 @@ def test_remote_storage_upload_queue_retries(
     # Exponential back-off in upload queue, so, gracious timeouts.

     wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
-    wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
-    wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
+    wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1))
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))

     # unblock churn operations
     configure_storage_sync_failpoints("off")
@@ -769,11 +769,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
     create_thread.join()


-def test_compaction_waits_for_upload(
+def test_paused_upload_stalls_checkpoint(
     neon_env_builder: NeonEnvBuilder,
 ):
     """
-    This test forces a race between upload and compaction.
+    This test checks that checkpoints block on uploads to remote storage.
     """
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

@@ -788,6 +788,10 @@ def test_compaction_waits_for_upload(
         }
     )

+    env.pageserver.allowed_errors.append(
+        f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
+    )
+
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline

@@ -808,76 +812,9 @@ def test_compaction_waits_for_upload(
     endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)")
     wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

-    client.timeline_checkpoint(tenant_id, timeline_id)
-    deltas_at_first = len(client.layer_map_info(tenant_id, timeline_id).delta_layers())
-    assert (
-        deltas_at_first == 2
-    ), "are you fixing #5863? just add one more checkpoint after 'CREATE TABLE bar ...' statement."
-
-    endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)")
-    endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")
-    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-
-    layers_before_last_checkpoint = client.layer_map_info(tenant_id, timeline_id).historic_by_name()
-    upload_stuck_layers = layers_before_last_checkpoint - layers_at_creation.historic_by_name()
-
-    assert len(upload_stuck_layers) > 0
-
-    for name in upload_stuck_layers:
-        assert env.pageserver.layer_exists(
-            tenant_id, timeline_id, parse_layer_file_name(name)
-        ), "while uploads are stuck the layers should be present on disk"
-
-    # now this will do the L0 => L1 compaction and want to remove
-    # upload_stuck_layers and the original initdb L0
-    client.timeline_checkpoint(tenant_id, timeline_id)
-
-    # as uploads are paused, the upload_stuck_layers should still be with us
-    for name in upload_stuck_layers:
-        assert env.pageserver.layer_exists(
-            tenant_id, timeline_id, parse_layer_file_name(name)
-        ), "uploads are stuck still over compaction"
-
-    compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name()
-    overlap = compacted_layers.intersection(upload_stuck_layers)
-    assert len(overlap) == 0, "none of the L0's should remain after L0 => L1 compaction"
-    assert (
-        len(compacted_layers) == 1
-    ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)"
-
-    def layer_deletes_completed():
-        m = client.get_metric_value("pageserver_layer_completed_deletes_total")
-        if m is None:
-            return 0
-        return int(m)
-
-    # if initdb created an initial delta layer, it might already be gc'd
-    # because it was uploaded before the failpoint was enabled. however, the
-    # deletion is not guaranteed to be complete.
-    assert layer_deletes_completed() <= 1
-
-    client.configure_failpoints(("before-upload-layer-pausable", "off"))
-
-    # Ensure that this actually terminates
-    wait_upload_queue_empty(client, tenant_id, timeline_id)
-
-    def until_layer_deletes_completed():
-        deletes = layer_deletes_completed()
-        log.info(f"layer_deletes: {deletes}")
-        # ensure that initdb delta layer AND the previously stuck are now deleted
-        assert deletes >= len(upload_stuck_layers) + 1
-
-    wait_until(10, 1, until_layer_deletes_completed)
-
-    for name in upload_stuck_layers:
-        assert not env.pageserver.layer_exists(
-            tenant_id, timeline_id, parse_layer_file_name(name)
-        ), "l0 should now be removed because of L0 => L1 compaction and completed uploads"
-
-    # We should not have hit the error handling path in uploads where a uploaded file is gone
-    assert not env.pageserver.log_contains(
-        "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
-    )
+    with pytest.raises(ReadTimeout):
+        client.timeline_checkpoint(tenant_id, timeline_id, timeout=5)
+    client.configure_failpoints(("before-upload-layer-pausable", "off"))


 def wait_upload_queue_empty(
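The queue-depth assertions changed above (assert_gt, assert_ge, assert_eq wrapped in wait_until) poll the upload queue rather than checking it once, because the queue drains asynchronously with exponential back-off. A minimal sketch of that polling pattern, assuming a retry helper with the same (iterations, interval, callable) shape as the one used in the test, not the project's actual fixture:

import time


def wait_until(iterations: int, interval: float, func):
    """Retry func until it stops raising, up to `iterations` attempts `interval` seconds apart."""
    last_exc = None
    for _ in range(iterations):
        try:
            return func()
        except Exception as e:  # assertion failures are retried until the deadline
            last_exc = e
            time.sleep(interval)
    raise last_exc


def assert_eq(a, b):
    assert a == b


# Usage in the same style as the test, e.g. wait until the layer delete queue has drained:
# wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))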
@@ -3,7 +3,7 @@ import threading
 import time
 from collections import defaultdict
 from datetime import datetime, timezone
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union

 import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
@@ -1785,6 +1785,126 @@ def test_storage_controller_node_deletion(
     env.storage_controller.consistency_check()


+@pytest.mark.parametrize("shard_count", [None, 2])
+def test_storage_controller_metadata_health(
+    neon_env_builder: NeonEnvBuilder,
+    shard_count: Optional[int],
+):
+    """
+    Create three tenants A, B, C.
+
+    Phase 1:
+    - A: Post healthy status.
+    - B: Post unhealthy status.
+    - C: No updates.
+
+    Phase 2:
+    - B: Post healthy status.
+    - C: Post healthy status.
+
+    Phase 3:
+    - A: Post unhealthy status.
+
+    Phase 4:
+    - Delete tenant A, metadata health status should be deleted as well.
+    """
+
+    def update_and_query_metadata_health(
+        env: NeonEnv,
+        healthy: List[TenantShardId],
+        unhealthy: List[TenantShardId],
+        outdated_duration: str = "1h",
+    ) -> Tuple[Set[str], Set[str]]:
+        """
+        Update metadata health. Then list tenant shards with unhealthy and
+        outdated metadata health status.
+        """
+        if healthy or unhealthy:
+            env.storage_controller.metadata_health_update(healthy, unhealthy)
+        result = env.storage_controller.metadata_health_list_unhealthy()
+        unhealthy_res = set(result["unhealthy_tenant_shards"])
+        result = env.storage_controller.metadata_health_list_outdated(outdated_duration)
+        outdated_res = set(record["tenant_shard_id"] for record in result["health_records"])
+
+        return unhealthy_res, outdated_res
+
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_start()
+
+    # Mock tenant (`initial_tenant``) with healthy scrubber scan result
+    tenant_a_shard_ids = (
+        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count)
+        if shard_count is not None
+        else [TenantShardId(env.initial_tenant, 0, 0)]
+    )
+
+    # Mock tenant with unhealthy scrubber scan result
+    tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count)
+    tenant_b_shard_ids = (
+        env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count)
+        if shard_count is not None
+        else [TenantShardId(tenant_b, 0, 0)]
+    )
+
+    # Mock tenant that never gets a health update from scrubber
+    tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count)
+
+    tenant_c_shard_ids = (
+        env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count)
+        if shard_count is not None
+        else [TenantShardId(tenant_c, 0, 0)]
+    )
+
+    # Metadata health table also updated as tenant shards are created.
+    assert env.storage_controller.metadata_health_is_healthy()
+
+    # post "fake" updates to storage controller db
+
+    unhealthy, outdated = update_and_query_metadata_health(
+        env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids
+    )
+
+    log.info(f"After Phase 1: {unhealthy=}, {outdated=}")
+    assert len(unhealthy) == len(tenant_b_shard_ids)
+    for t in tenant_b_shard_ids:
+        assert str(t) in unhealthy
+    assert len(outdated) == 0
+
+    unhealthy, outdated = update_and_query_metadata_health(
+        env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[]
+    )
+
+    log.info(f"After Phase 2: {unhealthy=}, {outdated=}")
+    assert len(unhealthy) == 0
+    assert len(outdated) == 0
+
+    unhealthy, outdated = update_and_query_metadata_health(
+        env, healthy=[], unhealthy=tenant_a_shard_ids
+    )
+
+    log.info(f"After Phase 3: {unhealthy=}, {outdated=}")
+    assert len(unhealthy) == len(tenant_a_shard_ids)
+    for t in tenant_a_shard_ids:
+        assert str(t) in unhealthy
+    assert len(outdated) == 0
+
+    # Phase 4: Delete A
+    env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant)
+
+    # A's unhealthy metadata health status should be deleted as well.
+    assert env.storage_controller.metadata_health_is_healthy()
+
+    # All shards from B and C are not fresh if set outdated duration to 0 seconds.
+    unhealthy, outdated = update_and_query_metadata_health(
+        env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s"
+    )
+    assert len(unhealthy) == 0
+    for t in tenant_b_shard_ids + tenant_c_shard_ids:
+        assert str(t) in outdated
+
+
 def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     """
     Test the `/control/v1/step_down` storage controller API. Upon receiving such
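For readers following the metadata health test added above: the storage controller responses it inspects are only touched through two keys, "unhealthy_tenant_shards" on the unhealthy listing and "health_records[*].tenant_shard_id" on the outdated listing. A hypothetical sketch of response shapes consistent with those accesses — inferred from the test only, not the controller's documented API:

# Hypothetical shapes, inferred only from the fields the test reads above.
tenant_shard = "<tenant_shard_id as string>"  # placeholder, e.g. str(TenantShardId(...))

list_unhealthy_response = {
    "unhealthy_tenant_shards": [tenant_shard],
}

list_outdated_response = {
    "health_records": [
        {"tenant_shard_id": tenant_shard},
    ],
}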
@@ -13,6 +13,7 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
 )
+from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import S3Storage, s3_storage
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
@@ -265,10 +266,85 @@ def test_scrubber_physical_gc_ancestors(
     # attach it, to drop any local state, then check it's still readable.
     workload.stop()
     drop_local_state(env, tenant_id)

     workload.validate()


+def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder):
+    """
+    When we delete a timeline after a shard split, the child shards do not directly delete the
+    layers in the ancestor shards. They rely on the scrubber to clean up.
+    """
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.num_pageservers = 2
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(
+        tenant_id,
+        timeline_id,
+        shard_count=None,
+        conf={
+            # Small layers and low compaction thresholds, so that when we split we can expect some to
+            # be dropped by child shards
+            "checkpoint_distance": f"{1024 * 1024}",
+            "compaction_threshold": "1",
+            "compaction_target_size": f"{1024 * 1024}",
+            "image_creation_threshold": "2",
+            "image_layer_creation_check_threshold": "0",
+            # Disable background compaction, we will do it explicitly
+            "compaction_period": "0s",
+            # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas
+            # and makes them GC'able
+            "pitr_interval": "0s",
+        },
+    )
+
+    # Make sure the original shard has some layers
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(100)
+
+    new_shard_count = 4
+    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
+
+    # Create a second timeline so that when we delete the first one, child shards still have some content in S3.
+    #
+    # This is a limitation of the scrubber: if a shard isn't in S3 (because it has no timelines), then the scrubber
+    # doesn't know about it, and won't perceive its ancestors as ancestors.
+    other_timeline_id = TimelineId.generate()
+    env.storage_controller.pageserver_api().timeline_create(
+        PgVersion.NOT_SET, tenant_id, other_timeline_id
+    )
+
+    # Write after split so that child shards have some indices in S3
+    workload.write_rows(100, upload=False)
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
+        ps.http_client().timeline_checkpoint(
+            shard, timeline_id, compact=False, wait_until_uploaded=True
+        )
+
+    # The timeline still exists in child shards and they reference its layers, so scrubbing
+    # now shouldn't delete anything.
+    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
+    assert gc_summary["remote_storage_errors"] == 0
+    assert gc_summary["indices_deleted"] == 0
+    assert gc_summary["ancestor_layers_deleted"] == 0
+
+    # Delete the timeline
+    env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id)
+
+    # Subsequently doing physical GC should clean up the ancestor layers
+    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
+    assert gc_summary["remote_storage_errors"] == 0
+    assert gc_summary["indices_deleted"] == 0
+    assert gc_summary["ancestor_layers_deleted"] > 0
+
+
 def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
     """
     Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards
@@ -440,10 +516,12 @@ def test_scrubber_scan_pageserver_metadata(
     assert len(index.layer_metadata) > 0
     it = iter(index.layer_metadata.items())

-    scan_summary = env.storage_scrubber.scan_metadata()
+    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
     assert not scan_summary["with_warnings"]
     assert not scan_summary["with_errors"]

+    assert env.storage_controller.metadata_health_is_healthy()
+
     # Delete a layer file that is listed in the index.
     layer, metadata = next(it)
     log.info(f"Deleting {timeline_path}/{layer.to_str()}")
@@ -453,7 +531,17 @@ def test_scrubber_scan_pageserver_metadata(
     )
     log.info(f"delete response: {delete_response}")

-    # Check scan summary. Expect it to be a L0 layer so only emit warnings.
+    # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings.
     scan_summary = env.storage_scrubber.scan_metadata()
     log.info(f"{pprint.pformat(scan_summary)}")
     assert len(scan_summary["with_warnings"]) > 0
+
+    assert env.storage_controller.metadata_health_is_healthy()
+
+    # Now post to storage controller, expect seeing one unhealthy health record
+    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+    log.info(f"{pprint.pformat(scan_summary)}")
+    assert len(scan_summary["with_warnings"]) > 0
+
+    unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
+    assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
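The timeline-deletion test above encodes a simple invariant: after a shard split, an ancestor shard's layer may only be garbage-collected once no child shard's index still references it (here, once the referencing timeline has been deleted). A simplified model of that rule, purely illustrative and not the scrubber's actual implementation:

from typing import Dict, Set


def deletable_ancestor_layers(
    ancestor_layers: Set[str],
    child_indices: Dict[str, Set[str]],  # child shard id -> layer file names its index references
) -> Set[str]:
    """Ancestor layers that no child shard index references any more (illustrative only)."""
    referenced: Set[str] = set()
    for layers in child_indices.values():
        referenced |= layers
    return ancestor_layers - referenced


# Before the timeline is deleted, children still reference the ancestor's layers -> nothing to GC.
assert deletable_ancestor_layers({"L1", "L2"}, {"child-0": {"L1"}, "child-1": {"L2"}}) == set()
# After deletion, no index references them -> the scrubber may remove them.
assert deletable_ancestor_layers({"L1", "L2"}, {"child-0": set(), "child-1": set()}) == {"L1", "L2"}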
@@ -277,8 +277,12 @@ files:
       help: 'Bytes between received and replayed LSN'
       key_labels:
       values: [replication_delay_bytes]
+      # We use a GREATEST call here because this calculation can be negative.
+      # The calculation is not atomic, meaning after we've gotten the receive
+      # LSN, the replay LSN may have advanced past the receive LSN we
+      # are using for the calculation.
       query: |
-        SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes;
+        SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;

     - metric_name: replication_delay_seconds
       type: gauge
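The new comment explains the clamp: pg_last_wal_receive_lsn() and pg_last_wal_replay_lsn() are read at slightly different moments, so replay can overtake the sampled receive LSN and the raw difference goes negative; GREATEST(0, ...) is simply max(0, x) applied in SQL. A tiny illustrative sketch of the same clamp, using made-up LSN byte offsets rather than values from the exporter:

# Hypothetical byte offsets: replay was sampled after receive and has already moved past it.
receive_lsn = 10_000_000   # sampled first
replay_lsn = 10_000_512    # sampled a moment later, already ahead

raw_delay_bytes = receive_lsn - replay_lsn      # -512: would report a negative gauge
clamped_delay_bytes = max(0, raw_delay_bytes)   # 0: what GREATEST(0, ...) reports

assert clamped_delay_bytes == 0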