test: funroll-loop first iteration in test_retried_detach_ancestor_after_failed_reparenting

test: refactor -- begin to -funroll-loops in test_retried_detach_ancestor_after_failed_reparenting
test: ensure gc is unpaused with the earlier deletion test
2026-06-05 06:20:37 +00:00 · 2024-07-26 14:39:32 +00:00 · 2024-07-26 14:39:32 +00:00 · 2024-07-26 14:39:32 +00:00 · 2024-07-26 14:39:32 +00:00 · 2024-07-26 14:39:32 +00:00
121 changed files with 2986 additions and 6703 deletions
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -14,8 +14,11 @@ inputs:
  api_host:
    description: 'Neon API host'
    default: console-stage.neon.build
+  provisioner:
+    description: 'k8s-pod or k8s-neonvm'
+    default: 'k8s-pod'
  compute_units:
-    description: '[Min, Max] compute units'
+    description: '[Min, Max] compute units. For k8s-neonvm, Min and Max are the autoscaling limits; for k8s-pod, Min and Max should be equal'
    default: '[1, 1]'

 outputs:
@@ -34,6 +37,10 @@ runs:
      # A shell without `set -x` to not to expose password/dsn in logs
      shell: bash -euo pipefail {0}
      run: |
+        if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
+          echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
+        fi
+
        project=$(curl \
          "https://${API_HOST}/api/v2/projects" \
          --fail \
@@ -45,7 +52,7 @@ runs:
              \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
              \"pg_version\": ${POSTGRES_VERSION},
              \"region_id\": \"${REGION_ID}\",
-              \"provisioner\": \"k8s-neonvm\",
+              \"provisioner\": \"${PROVISIONER}\",
              \"autoscaling_limit_min_cu\": ${MIN_CU},
              \"autoscaling_limit_max_cu\": ${MAX_CU},
              \"settings\": { }
@@ -68,5 +75,6 @@ runs:
        API_KEY: ${{ inputs.api_key }}
        REGION_ID: ${{ inputs.region_id }}
        POSTGRES_VERSION: ${{ inputs.postgres_version }}
+        PROVISIONER: ${{ inputs.provisioner }}
        MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
        MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -131,8 +131,8 @@ runs:
          exit 1
        fi
        if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
-          # -n sets the number of parallel processes that pytest-xdist will run
-          EXTRA_PARAMS="-n12 $EXTRA_PARAMS"
+          # -n16 uses sixteen processes to run tests via pytest-xdist
+          EXTRA_PARAMS="-n16 $EXTRA_PARAMS"

          # --dist=loadgroup points tests marked with @pytest.mark.xdist_group
          # to the same worker to make @pytest.mark.order work with xdist
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -19,10 +19,6 @@ on:
        description: 'debug or release'
        required: true
        type: string
-      pg-versions:
-        description: 'a json array of postgres versions to run regression tests on'
-        required: true
-        type: string

 defaults:
  run:
@@ -258,7 +254,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        pg_version: ${{ fromJson(inputs.pg-versions) }}
+        pg_version: [ v14, v15, v16 ]
    steps:
      - uses: actions/checkout@v4
        with:
@@ -282,11 +278,14 @@ jobs:
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
          BUILD_TAG: ${{ inputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_IMPL: vectored
+          PAGESERVER_GET_IMPL: vectored
+          PAGESERVER_VALIDATE_VEC_GET: true

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
      - name: Merge and upload coverage data
        if: |
          false &&
-          inputs.build-type == 'debug' && matrix.pg_version == 'v16'
+          inputs.build-type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -63,9 +63,11 @@ jobs:
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "neon-staging"
            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+            provisioner: 'k8s-pod'
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "azure-staging"
            region_id: 'azure-eastus2'
+            provisioner: 'k8s-neonvm'
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -98,6 +100,7 @@ jobs:
        region_id: ${{ matrix.region_id }}
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+        provisioner: ${{ matrix.provisioner }}

    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
@@ -213,11 +216,11 @@ jobs:
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
    #
    # Available platforms:
-    # - neonvm-captest-new: Freshly created project (1 CU)
-    # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
+    # - neon-captest-new: Freshly created project (1 CU)
+    # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
    # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
    # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
-    # - neonvm-captest-reuse: Reusing existing project
+    # - neon-captest-reuse: Reusing existing project
    # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
    env:
@@ -242,16 +245,18 @@ jobs:
            "'"$region_id_default"'"
            ],
          "platform": [
-            "neonvm-captest-new",
-            "neonvm-captest-reuse",
+            "neon-captest-new",
+            "neon-captest-reuse",
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier",         "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new",              "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'

@@ -266,7 +271,7 @@ jobs:
      run: |
        matrix='{
          "platform": [
-            "neonvm-captest-reuse"
+            "neon-captest-reuse"
          ]
        }'

@@ -282,7 +287,7 @@ jobs:
      run: |
        matrix='{
          "platform": [
-            "neonvm-captest-reuse"
+            "neon-captest-reuse"
          ],
          "scale": [
            "10"
@@ -333,7 +338,7 @@ jobs:
        prefix: latest

    - name: Create Neon Project
-      if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
+      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
@@ -341,18 +346,19 @@ jobs:
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
        compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
+        provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

    - name: Set up Connection String
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
          neonvm-captest-sharding-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
            ;;
-          neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
          rds-aurora)
@@ -436,9 +442,9 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - PLATFORM: "neonvm-captest-pgvector"
+          - PLATFORM: "neon-captest-pgvector"
          - PLATFORM: "azure-captest-pgvector"
-
+
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
      TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -480,7 +486,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-pgvector)
+          neon-captest-pgvector)
            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
            ;;
          azure-captest-pgvector)
@@ -579,7 +585,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
            ;;
          rds-aurora)
@@ -589,7 +595,7 @@ jobs:
            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -666,7 +672,7 @@ jobs:
    - name: Get Connstring Secret Name
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            ENV_PLATFORM=CAPTEST_TPCH
            ;;
          rds-aurora)
@@ -676,7 +682,7 @@ jobs:
            ENV_PLATFORM=RDS_AURORA_TPCH
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -753,7 +759,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
            ;;
          rds-aurora)
@@ -763,7 +769,7 @@ jobs:
            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -203,8 +203,7 @@ jobs:
      fail-fast: false
      matrix:
        arch: [ x64 ]
-        # Do not build or run tests in debug for release branches
-        build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
+        build-type: [ debug, release ]
        include:
          - build-type: release
            arch: arm64
@@ -214,8 +213,6 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
-      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
    secrets: inherit

  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -289,6 +286,9 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_IMPL: vectored
+          PAGESERVER_GET_IMPL: vectored
+          PAGESERVER_VALIDATE_VEC_GET: false
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -309,7 +309,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  create-test-report:
-    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
+    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -836,9 +836,6 @@ jobs:
          rm -rf .docker-custom

  promote-images:
-    permissions:
-      contents: read  # This is required for actions/checkout
-      id-token: write # This is required for Azure Login to work.
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04

@@ -865,28 +862,6 @@ jobs:
                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
          done

-      - name: Azure login
-        if: github.ref_name == 'main'
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
-      - name: Login to ACR
-        if: github.ref_name == 'main'
-        run: |
-          az acr login --name=neoneastus2
-
-      - name: Copy docker images to ACR-dev
-        if: github.ref_name == 'main'
-        run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
-            docker buildx imagetools create \
-              -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
-                                        neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
-          done
-
      - name: Add latest tag to images
        if: github.ref_name == 'main'
        run: |
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -13,7 +13,6 @@ on:
    paths:
      - '.github/workflows/pg-clients.yml'
      - 'test_runner/pg_clients/**'
-      - 'test_runner/logical_repl/**'
      - 'poetry.lock'
  workflow_dispatch:

@@ -50,77 +49,6 @@ jobs:
      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
    secrets: inherit

-  test-logical-replication:
-    needs: [ build-build-tools-image ]
-    runs-on: ubuntu-22.04
-
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init --user root
-    services:
-      clickhouse:
-        image: clickhouse/clickhouse-server:24.6.3.64
-        ports:
-          - 9000:9000
-          - 8123:8123
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Download Neon artifact
-        uses: ./.github/actions/download
-        with:
-          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-          path: /tmp/neon/
-          prefix: latest
-
-      - name: Create Neon Project
-        id: create-neon-project
-        uses: ./.github/actions/neon-project-create
-        with:
-          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-          postgres_version: ${{ env.DEFAULT_PG_VERSION }}
-
-      - name: Run tests
-        uses: ./.github/actions/run-python-test-set
-        with:
-          build_type: remote
-          test_selection: logical_repl
-          run_in_parallel: false
-          extra_params: -m remote_cluster
-          pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        env:
-          BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
-
-      - name: Delete Neon Project
-        if: always()
-        uses: ./.github/actions/neon-project-delete
-        with:
-          project_id: ${{ steps.create-neon-project.outputs.project_id }}
-          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-
-      - name: Create Allure report
-        if: ${{ !cancelled() }}
-        id: create-allure-report
-        uses: ./.github/actions/allure-report-generate
-        with:
-          store-test-results-into-db: true
-        env:
-          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-
-      - name: Post to a Slack channel
-        if: github.event.schedule && failure()
-        uses: slackapi/slack-github-action@v1
-        with:
-          channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
-          slack-message: |
-            Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
  test-postgres-client-libs:
    needs: [ build-build-tools-image ]
    runs-on: ubuntu-22.04
--- a/8
+++ b/8
@@ -1,13 +1,13 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /storage_controller @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
+/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
 /libs/remote_storage/ @neondatabase/storage
-/libs/safekeeper_api/ @neondatabase/storage
+/libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling
 /pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
-/pgxn/neon/ @neondatabase/compute @neondatabase/storage
+/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
 /proxy/ @neondatabase/proxy
-/safekeeper/ @neondatabase/storage
+/safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1418,7 +1418,7 @@ dependencies = [
 "clap",
 "criterion-plot",
 "is-terminal",
- "itertools 0.10.5",
+ "itertools",
 "num-traits",
 "once_cell",
 "oorandom",
@@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
 dependencies = [
 "cast",
- "itertools 0.10.5",
+ "itertools",
 ]

 [[package]]
@@ -1672,7 +1672,6 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
 dependencies = [
 "bitflags 2.4.1",
 "byteorder",
- "chrono",
 "diesel_derives",
 "itoa",
 "pq-sys",
@@ -2134,12 +2133,6 @@ dependencies = [
 "slab",
 ]

-[[package]]
-name = "gen_ops"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
-
 [[package]]
 name = "generic-array"
 version = "0.14.7"
@@ -2716,6 +2709,17 @@ version = "3.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"

+[[package]]
+name = "io-lifetimes"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "io-uring"
 version = "0.6.2"
@@ -2734,13 +2738,14 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"

 [[package]]
 name = "is-terminal"
-version = "0.4.12"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
+checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
 "hermit-abi",
- "libc",
- "windows-sys 0.52.0",
+ "io-lifetimes",
+ "rustix 0.37.25",
+ "windows-sys 0.48.0",
 ]

 [[package]]
@@ -2752,15 +2757,6 @@ dependencies = [
 "either",
 ]

-[[package]]
-name = "itertools"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
-dependencies = [
- "either",
-]
-
 [[package]]
 name = "itoa"
 version = "1.0.6"
@@ -2875,6 +2871,18 @@ version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"

+[[package]]
+name = "linux-raw-sys"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.13"
@@ -2992,7 +3000,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
 dependencies = [
 "libc",
 "measured",
- "procfs",
+ "procfs 0.16.0",
 ]

 [[package]]
@@ -3037,7 +3045,7 @@ dependencies = [
 "measured",
 "measured-process",
 "once_cell",
- "procfs",
+ "procfs 0.14.2",
 "prometheus",
 "rand 0.8.5",
 "rand_distr",
@@ -3566,7 +3574,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "leaky-bucket",
 "md5",
 "metrics",
@@ -3584,9 +3592,8 @@ dependencies = [
 "postgres_connection",
 "postgres_ffi",
 "pq_proto",
- "procfs",
+ "procfs 0.14.2",
 "rand 0.8.5",
- "range-set-blaze",
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
@@ -3637,7 +3644,7 @@ dependencies = [
 "hex",
 "humantime",
 "humantime-serde",
- "itertools 0.10.5",
+ "itertools",
 "postgres_ffi",
 "rand 0.8.5",
 "serde",
@@ -3695,7 +3702,7 @@ dependencies = [
 "hex-literal",
 "humantime",
 "humantime-serde",
- "itertools 0.10.5",
+ "itertools",
 "metrics",
 "once_cell",
 "pageserver_api",
@@ -4027,7 +4034,7 @@ name = "postgres_connection"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "itertools 0.10.5",
+ "itertools",
 "once_cell",
 "postgres",
 "tokio-postgres",
@@ -4085,7 +4092,7 @@ version = "0.1.0"
 dependencies = [
 "byteorder",
 "bytes",
- "itertools 0.10.5",
+ "itertools",
 "pin-project-lite",
 "postgres-protocol",
 "rand 0.8.5",
@@ -4131,6 +4138,21 @@ dependencies = [
 "unicode-ident",
 ]

+[[package]]
+name = "procfs"
+version = "0.14.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
+dependencies = [
+ "bitflags 1.3.2",
+ "byteorder",
+ "chrono",
+ "flate2",
+ "hex",
+ "lazy_static",
+ "rustix 0.36.16",
+]
+
 [[package]]
 name = "procfs"
 version = "0.16.0"
@@ -4138,12 +4160,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
 dependencies = [
 "bitflags 2.4.1",
- "chrono",
- "flate2",
 "hex",
 "lazy_static",
 "procfs-core",
- "rustix",
+ "rustix 0.38.28",
 ]

 [[package]]
@@ -4153,15 +4173,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
 dependencies = [
 "bitflags 2.4.1",
- "chrono",
 "hex",
 ]

 [[package]]
 name = "prometheus"
-version = "0.13.4"
+version = "0.13.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
+checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
 dependencies = [
 "cfg-if",
 "fnv",
@@ -4169,7 +4188,7 @@ dependencies = [
 "libc",
 "memchr",
 "parking_lot 0.12.1",
- "procfs",
+ "procfs 0.14.2",
 "thiserror",
 ]

@@ -4191,7 +4210,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
 dependencies = [
 "bytes",
 "heck 0.4.1",
- "itertools 0.10.5",
+ "itertools",
 "lazy_static",
 "log",
 "multimap",
@@ -4212,7 +4231,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
 dependencies = [
 "anyhow",
- "itertools 0.10.5",
+ "itertools",
 "proc-macro2",
 "quote",
 "syn 1.0.109",
@@ -4269,7 +4288,7 @@ dependencies = [
 "hyper-util",
 "indexmap 2.0.1",
 "ipnet",
- "itertools 0.10.5",
+ "itertools",
 "lasso",
 "md5",
 "measured",
@@ -4445,18 +4464,6 @@ dependencies = [
 "rand_core 0.5.1",
 ]

-[[package]]
-name = "range-set-blaze"
-version = "0.1.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
-dependencies = [
- "gen_ops",
- "itertools 0.12.1",
- "num-integer",
- "num-traits",
-]
-
 [[package]]
 name = "rayon"
 version = "1.7.0"
@@ -4625,7 +4632,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "metrics",
 "once_cell",
 "pin-project-lite",
@@ -4935,6 +4942,34 @@ dependencies = [
 "nom",
 ]

+[[package]]
+name = "rustix"
+version = "0.36.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.1.4",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "rustix"
+version = "0.37.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.3.8",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "rustix"
 version = "0.38.28"
@@ -5683,7 +5718,6 @@ dependencies = [
 "aws-config",
 "bytes",
 "camino",
- "chrono",
 "clap",
 "control_plane",
 "diesel",
@@ -5694,7 +5728,7 @@ dependencies = [
 "hex",
 "humantime",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "lasso",
 "measured",
 "metrics",
@@ -5703,7 +5737,6 @@ dependencies = [
 "pageserver_client",
 "postgres_connection",
 "r2d2",
- "rand 0.8.5",
 "reqwest 0.12.4",
 "routerify",
 "scopeguard",
@@ -5759,10 +5792,9 @@ dependencies = [
 "either",
 "futures",
 "futures-util",
- "git-version",
 "hex",
 "humantime",
- "itertools 0.10.5",
+ "itertools",
 "once_cell",
 "pageserver",
 "pageserver_api",
@@ -5939,15 +5971,15 @@ dependencies = [

 [[package]]
 name = "tempfile"
-version = "3.9.0"
+version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
+checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
 dependencies = [
 "cfg-if",
- "fastrand 2.0.0",
- "redox_syscall 0.4.1",
- "rustix",
- "windows-sys 0.52.0",
+ "fastrand 1.9.0",
+ "redox_syscall 0.3.5",
+ "rustix 0.37.25",
+ "windows-sys 0.45.0",
 ]

 [[package]]
@@ -7144,6 +7176,15 @@ dependencies = [
 "windows_x86_64_msvc 0.42.2",
 ]

+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets 0.42.2",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.48.0"
@@ -7162,6 +7203,21 @@ dependencies = [
 "windows-targets 0.52.4",
 ]

+[[package]]
+name = "windows-targets"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
 [[package]]
 name = "windows-targets"
 version = "0.48.0"
@@ -7391,7 +7447,7 @@ dependencies = [
 "hmac",
 "hyper 0.14.26",
 "indexmap 1.9.3",
- "itertools 0.10.5",
+ "itertools",
 "libc",
 "log",
 "memchr",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
 parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
-procfs = "0.16"
+procfs = "0.14"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.80.0
+ENV RUSTC_VERSION=1.79.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -4,11 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-default = []
-# Enables test specific features.
-testing = []
-
 [dependencies]
 anyhow.workspace = true
 async-compression.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -400,15 +400,7 @@ impl ComputeNode {
    pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let mut retry_period_ms = 500.0;
        let mut attempts = 0;
-        const DEFAULT_ATTEMPTS: u16 = 10;
-        #[cfg(feature = "testing")]
-        let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
-            u16::from_str(&v).unwrap()
-        } else {
-            DEFAULT_ATTEMPTS
-        };
-        #[cfg(not(feature = "testing"))]
-        let max_attempts = DEFAULT_ATTEMPTS;
+        let max_attempts = 10;
        loop {
            let result = self.try_get_basebackup(compute_state, lsn);
            match result {
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {

 fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
    for (var, val) in std::env::vars() {
-        if var.starts_with("NEON_") {
+        if var.starts_with("NEON_PAGESERVER_") {
            cmd = cmd.env(var, val);
        }
    }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -514,6 +514,7 @@ impl LocalEnv {
                #[derive(serde::Serialize, serde::Deserialize)]
                // (allow unknown fields, unlike PageServerConf)
                struct PageserverConfigTomlSubset {
+                    id: NodeId,
                    listen_pg_addr: String,
                    listen_http_addr: String,
                    pg_auth_type: AuthType,
@@ -525,30 +526,18 @@ impl LocalEnv {
                        .with_context(|| format!("read {:?}", config_toml_path))?,
                )
                .context("parse pageserver.toml")?;
-                let identity_toml_path = dentry.path().join("identity.toml");
-                #[derive(serde::Serialize, serde::Deserialize)]
-                struct IdentityTomlSubset {
-                    id: NodeId,
-                }
-                let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
-                    &std::fs::read_to_string(&identity_toml_path)
-                        .with_context(|| format!("read {:?}", identity_toml_path))?,
-                )
-                .context("parse identity.toml")?;
                let PageserverConfigTomlSubset {
+                    id: config_toml_id,
                    listen_pg_addr,
                    listen_http_addr,
                    pg_auth_type,
                    http_auth_type,
                } = config_toml;
-                let IdentityTomlSubset {
-                    id: identity_toml_id,
-                } = identity_toml;
                let conf = PageServerConf {
                    id: {
                        anyhow::ensure!(
-                            identity_toml_id == id,
-                            "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
+                            config_toml_id == id,
+                            "id mismatch: config_toml.id={config_toml_id} id={id}",
                        );
                        id
                    },
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -127,13 +127,10 @@ impl PageServerNode {
        }

        // Apply the user-provided overrides
-        overrides.push({
-            let mut doc =
-                toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
-            // `id` is written out to `identity.toml` instead of `pageserver.toml`
-            doc.remove("id").expect("it's part of the struct");
-            doc.to_string()
-        });
+        overrides.push(
+            toml_edit::ser::to_string_pretty(&conf)
+                .expect("we deserialized this from toml earlier"),
+        );

        // Turn `overrides` into a toml document.
        // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,6 +1,5 @@
-use std::collections::HashSet;
 use std::str::FromStr;
-use std::time::{Duration, Instant};
+use std::time::Instant;

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
@@ -295,42 +294,6 @@ pub enum PlacementPolicy {
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}

-/// Metadata health record posted from scrubber.
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthRecord {
-    pub tenant_shard_id: TenantShardId,
-    pub healthy: bool,
-    pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthUpdateRequest {
-    pub healthy_tenant_shards: HashSet<TenantShardId>,
-    pub unhealthy_tenant_shards: HashSet<TenantShardId>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthUpdateResponse {}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListUnhealthyResponse {
-    pub unhealthy_tenant_shards: Vec<TenantShardId>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListOutdatedRequest {
-    #[serde(with = "humantime_serde")]
-    pub not_scrubbed_for: Duration,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListOutdatedResponse {
-    pub health_records: Vec<MetadataHealthRecord>,
-}
-
 #[cfg(test)]
 mod test {
    use super::*;
--- a/libs/pageserver_api/src/models/detach_ancestor.rs
+++ b/libs/pageserver_api/src/models/detach_ancestor.rs
@@ -1,6 +1,8 @@
+use std::collections::HashSet;
+
 use utils::id::TimelineId;

 #[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct AncestorDetached {
-    pub reparented_timelines: Vec<TimelineId>,
+    pub reparented_timelines: HashSet<TimelineId>,
 }
--- a/libs/postgres_ffi/src/controlfile_utils.rs
+++ b/libs/postgres_ffi/src/controlfile_utils.rs
@@ -29,7 +29,7 @@ use anyhow::{bail, Result};
 use bytes::{Bytes, BytesMut};

 /// Equivalent to sizeof(ControlFileData) in C
-const SIZEOF_CONTROLDATA: usize = size_of::<ControlFileData>();
+const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();

 impl ControlFileData {
    /// Compute the offset of the `crc` field within the `ControlFileData` struct.
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -31,7 +31,7 @@ pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
 //

 // Assumes 8 byte alignment
-const SIZEOF_PAGE_HEADER_DATA: usize = size_of::<PageHeaderData>();
+const SIZEOF_PAGE_HEADER_DATA: usize = std::mem::size_of::<PageHeaderData>();
 pub const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = (SIZEOF_PAGE_HEADER_DATA + 7) & !7;

 //
@@ -191,7 +191,7 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
 pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
 pub const XLOG_TBLSPC_DROP: u8 = 0x10;

-pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
+pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::<XLogRecord>() as u32;

 //
 // from xlogrecord.h
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -42,9 +42,9 @@ pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
 pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;

-pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = size_of::<XLogPageHeaderData>();
-pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = size_of::<XLogLongPageHeaderData>();
-pub const XLOG_SIZE_OF_XLOG_RECORD: usize = size_of::<XLogRecord>();
+pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
+pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
+pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
 #[allow(clippy::identity_op)]
 pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;

@@ -311,7 +311,7 @@ impl XLogLongPageHeaderData {
    }
 }

-pub const SIZEOF_CHECKPOINT: usize = size_of::<CheckPoint>();
+pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();

 impl CheckPoint {
    pub fn encode(&self) -> Result<Bytes, SerializeError> {
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -178,7 +178,7 @@ pub fn test_find_end_of_wal_last_crossing_segment() {
 /// currently 1024.
 #[test]
 pub fn test_update_next_xid() {
-    let checkpoint_buf = [0u8; size_of::<CheckPoint>()];
+    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();

    checkpoint.nextXid = FullTransactionId { value: 10 };
@@ -204,7 +204,7 @@ pub fn test_update_next_xid() {

 #[test]
 pub fn test_update_next_multixid() {
-    let checkpoint_buf = [0u8; size_of::<CheckPoint>()];
+    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();

    // simple case
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -355,8 +355,7 @@ impl RemoteStorage for AzureBlobStorage {
                    .blobs()
                    .map(|k| ListingObject{
                        key: self.name_to_relative_path(&k.name),
-                        last_modified: k.properties.last_modified.into(),
-                        size: k.properties.content_length,
+                        last_modified: k.properties.last_modified.into()
                    }
                    );

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -144,7 +144,6 @@ impl RemotePath {
 ///
 /// The WithDelimiter mode will populate `prefixes` and `keys` in the result.  The
 /// NoDelimiter mode will only populate `keys`.
-#[derive(Copy, Clone)]
 pub enum ListingMode {
    WithDelimiter,
    NoDelimiter,
@@ -154,7 +153,6 @@ pub enum ListingMode {
 pub struct ListingObject {
    pub key: RemotePath,
    pub last_modified: SystemTime,
-    pub size: u64,
 }

 #[derive(Default)]
@@ -196,7 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;
+    ) -> impl Stream<Item = Result<Listing, DownloadError>>;

    async fn list(
        &self,
@@ -353,10 +351,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &'a CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
        match self {
            Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
-                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
+                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
            Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -368,7 +368,6 @@ impl RemoteStorage for LocalFs {
                            key: k.clone(),
                            // LocalFs is just for testing, so just specify a dummy time
                            last_modified: SystemTime::now(),
-                            size: 0,
                        })
                    }
                })
@@ -412,7 +411,6 @@ impl RemoteStorage for LocalFs {
                            key: RemotePath::from_string(&relative_key).unwrap(),
                            // LocalFs is just for testing
                            last_modified: SystemTime::now(),
-                            size: 0,
                        });
                    }
                }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -565,12 +565,9 @@ impl RemoteStorage for S3Bucket {
                        }
                    };

-                    let size = object.size.unwrap_or(0) as u64;
-
                    result.keys.push(ListingObject{
                        key,
-                        last_modified,
-                        size,
+                        last_modified
                    });
                    if let Some(mut mk) = max_keys {
                        assert!(mk > 0);
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
        async_stream::stream! {
            self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
                .map_err(DownloadError::Other)?;
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
 #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
-    /// Provides access to all data for a specific tenant (specified in `struct Claims` below)
+    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
    // TODO: join these two?
    Tenant,
-    /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
-    /// Should only be used e.g. for status check/tenant creation/list.
+    // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
+    // Should only be used e.g. for status check/tenant creation/list.
    PageServerApi,
-    /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
-    /// Should only be used e.g. for status check.
-    /// Currently also used for connection from any pageserver to any safekeeper.
+    // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
+    // Should only be used e.g. for status check.
+    // Currently also used for connection from any pageserver to any safekeeper.
    SafekeeperData,
-    /// The scope used by pageservers in upcalls to storage controller and cloud control plane
+    // The scope used by pageservers in upcalls to storage controller and cloud control plane
    #[serde(rename = "generations_api")]
    GenerationsApi,
-    /// Allows access to control plane managment API and some storage controller endpoints.
+    // Allows access to control plane management API and some storage controller endpoints.
    Admin,

    /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -8,10 +8,33 @@ pub struct Completion {
    _token: TaskTrackerToken,
 }

+impl std::fmt::Debug for Completion {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Completion")
+            .field("siblings", &self._token.task_tracker().len())
+            .finish()
+    }
+}
+
+impl Completion {
+    /// Returns true if this completion is associated with the given barrier.
+    pub fn blocks(&self, barrier: &Barrier) -> bool {
+        TaskTracker::ptr_eq(self._token.task_tracker(), &barrier.0)
+    }
+}
+
 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
 pub struct Barrier(TaskTracker);

+impl std::fmt::Debug for Barrier {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Barrier")
+            .field("remaining", &self.0.len())
+            .finish()
+    }
+}
+
 impl Default for Barrier {
    fn default() -> Self {
        let (_, rx) = channel();
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -49,7 +49,6 @@ postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 rand.workspace = true
-range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,4 +1,3 @@
-use criterion::measurement::WallTime;
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
@@ -16,11 +15,7 @@ use utils::id::{TenantId, TimelineId};

 use utils::lsn::Lsn;

-use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
-
-fn fixture_path(relative: &str) -> PathBuf {
-    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
-}
+use criterion::{black_box, criterion_group, criterion_main, Criterion};

 fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
    let mut layer_map = LayerMap::default();
@@ -114,7 +109,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
 // between each test run.
 fn bench_from_captest_env(c: &mut Criterion) {
    // TODO consider compressing this file
-    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

    // Test with uniform query pattern
@@ -144,7 +139,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
 fn bench_from_real_project(c: &mut Criterion) {
    // Init layer map
    let now = Instant::now();
-    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
    println!("Finished layer map init in {:?}", now.elapsed());

    // Choose uniformly distributed queries
@@ -247,72 +242,7 @@ fn bench_sequential(c: &mut Criterion) {
    group.finish();
 }

-fn bench_visibility_with_map(
-    group: &mut BenchmarkGroup<WallTime>,
-    layer_map: LayerMap,
-    read_points: Vec<Lsn>,
-    bench_name: &str,
-) {
-    group.bench_function(bench_name, |b| {
-        b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
-    });
-}
-
-// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
-fn bench_visibility(c: &mut Criterion) {
-    let mut group = c.benchmark_group("visibility");
-    {
-        // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
-        let now = Instant::now();
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-        for i in 0..100_000 {
-            let i32 = (i as u32) % 100;
-            let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-            let layer = PersistentLayerDesc::new_img(
-                TenantShardId::unsharded(TenantId::generate()),
-                TimelineId::generate(),
-                zero.add(10 * i32)..zero.add(10 * i32 + 1),
-                Lsn(i),
-                0,
-            );
-            updates.insert_historic(layer);
-        }
-        updates.flush();
-        println!("Finished layer map init in {:?}", now.elapsed());
-
-        let mut read_points = Vec::new();
-        for i in (0..100_000).step_by(1000) {
-            read_points.push(Lsn(i));
-        }
-
-        bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
-    }
-
-    {
-        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
-        let read_points = vec![Lsn(0x1C760FA190)];
-        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
-
-        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
-        let read_points = vec![
-            Lsn(0x1C760FA190),
-            Lsn(0x000000931BEAD539),
-            Lsn(0x000000931BF63011),
-            Lsn(0x000000931B33AE68),
-            Lsn(0x00000038E67ABFA0),
-            Lsn(0x000000931B33AE68),
-            Lsn(0x000000914E3F38F0),
-            Lsn(0x000000931B33AE68),
-        ];
-        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
-    }
-
-    group.finish();
-}
-
 criterion_group!(group_1, bench_from_captest_env);
 criterion_group!(group_2, bench_from_real_project);
 criterion_group!(group_3, bench_sequential);
-criterion_group!(group_4, bench_visibility);
-criterion_main!(group_1, group_2, group_3, group_4);
+criterion_main!(group_1, group_2, group_3);
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -17,9 +17,11 @@ use pageserver::config::PageserverIdentity;
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
-use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
+use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
-use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
+use pageserver::{
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
+};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
@@ -29,9 +31,11 @@ use tracing::*;
 use metrics::set_build_info_metric;
 use pageserver::{
    config::PageServerConf,
+    context::{DownloadBehavior, RequestContext},
    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
-    task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
+    task_mgr::TaskKind,
+    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
    tenant::mgr,
    virtual_file,
 };
@@ -125,7 +129,6 @@ fn main() -> anyhow::Result<()> {
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.get_impl, "starting with get page implementation");
    info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
-    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
@@ -590,13 +593,30 @@ fn start_pageserver(

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
-        let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
-        pageserver_listener
-            .set_nonblocking(true)
-            .context("set listener to nonblocking")?;
-        tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
-    });
+    let libpq_listener = {
+        let cancel = CancellationToken::new();
+        let libpq_ctx = RequestContext::todo_child(
+            TaskKind::LibpqEndpointListener,
+            // listener task shouldn't need to download anything. (We will
+            // create separate sub-contexts for each connection, with their
+            // own download behavior. This context is used only to listen and
+            // accept connections.)
+            DownloadBehavior::Error,
+        );
+
+        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+            "libpq listener",
+            page_service::libpq_listener_main(
+                tenant_manager.clone(),
+                pg_auth,
+                pageserver_listener,
+                conf.pg_auth_type,
+                libpq_ctx,
+                cancel.clone(),
+            ),
+        ));
+        LibpqEndpointListener(CancellableTask { task, cancel })
+    };

    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

@@ -624,7 +644,7 @@ fn start_pageserver(
            shutdown_pageserver.take();
            pageserver::shutdown_pageserver(
                http_endpoint_listener,
-                page_service,
+                libpq_listener,
                consumption_metrics_tasks,
                disk_usage_eviction_task,
                &tenant_manager,
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -29,7 +29,6 @@ use utils::{
    logging::LogFormat,
 };

-use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -53,7 +52,7 @@ pub mod defaults {
    use pageserver_api::models::ImageCompressionAlgorithm;
    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

-    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
+    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";

    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
@@ -84,16 +83,16 @@ pub mod defaults {
    #[cfg(not(target_os = "linux"))]
    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";

-    pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored";
+    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";

-    pub const DEFAULT_GET_IMPL: &str = "vectored";
+    pub const DEFAULT_GET_IMPL: &str = "legacy";

    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
        ImageCompressionAlgorithm::Disabled;

-    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
+    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

@@ -296,10 +295,6 @@ pub struct PageServerConf {
    pub ephemeral_bytes_per_memory_kb: usize,

    pub l0_flush: L0FlushConfig,
-
-    /// This flag is temporary and will be removed after gradual rollout.
-    /// See <https://github.com/neondatabase/neon/issues/8184>.
-    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -361,6 +356,8 @@ struct PageServerConfigBuilder {
    auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,

+    id: BuilderValue<NodeId>,
+
    broker_endpoint: BuilderValue<Uri>,
    broker_keepalive_interval: BuilderValue<Duration>,

@@ -406,13 +403,14 @@ struct PageServerConfigBuilder {
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,

    l0_flush: BuilderValue<L0FlushConfig>,
-
-    compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
 }

 impl PageServerConfigBuilder {
-    fn new() -> Self {
-        Self::default()
+    fn new(node_id: NodeId) -> Self {
+        let mut this = Self::default();
+        this.id(node_id);
+
+        this
    }

    #[inline(always)]
@@ -440,6 +438,7 @@ impl PageServerConfigBuilder {
            pg_auth_type: Set(AuthType::Trust),
            auth_validation_public_key_path: Set(None),
            remote_storage_config: Set(None),
+            id: NotSet,
            broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                .parse()
                .expect("failed to parse default broker endpoint")),
@@ -497,7 +496,6 @@ impl PageServerConfigBuilder {
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: Set(L0FlushConfig::default()),
-            compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
        }
    }
 }
@@ -570,6 +568,10 @@ impl PageServerConfigBuilder {
        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
    }

+    pub fn id(&mut self, node_id: NodeId) {
+        self.id = BuilderValue::Set(node_id)
+    }
+
    pub fn log_format(&mut self, log_format: LogFormat) {
        self.log_format = BuilderValue::Set(log_format)
    }
@@ -681,11 +683,7 @@ impl PageServerConfigBuilder {
        self.l0_flush = BuilderValue::Set(value);
    }

-    pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
-        self.compact_level0_phase1_value_access = BuilderValue::Set(value);
-    }
-
-    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
+    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

        macro_rules! conf {
@@ -718,6 +716,7 @@ impl PageServerConfigBuilder {
                pg_auth_type,
                auth_validation_public_key_path,
                remote_storage_config,
+                id,
                broker_endpoint,
                broker_keepalive_interval,
                log_format,
@@ -742,11 +741,9 @@ impl PageServerConfigBuilder {
                image_compression,
                ephemeral_bytes_per_memory_kb,
                l0_flush,
-                compact_level0_phase1_value_access,
            }
            CUSTOM LOGIC
            {
-                id: id,
                // TenantConf is handled separately
                default_tenant_conf: TenantConf::default(),
                concurrent_tenant_warmup: ConfigurableSemaphore::new({
@@ -896,7 +893,7 @@ impl PageServerConf {
        toml: &Document,
        workdir: &Utf8Path,
    ) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::new();
+        let mut builder = PageServerConfigBuilder::new(node_id);
        builder.workdir(workdir.to_owned());

        let mut t_conf = TenantConfOpt::default();
@@ -927,6 +924,8 @@ impl PageServerConf {
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
+                "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
+                            // Logging is not set up yet, so we can't log that it is being ignored.
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                "log_format" => builder.log_format(
@@ -1015,14 +1014,11 @@ impl PageServerConf {
                "l0_flush" => {
                    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
                }
-                "compact_level0_phase1_value_access" => {
-                    builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
-                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }

-        let mut conf = builder.build(node_id).context("invalid config")?;
+        let mut conf = builder.build().context("invalid config")?;

        if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
            let auth_validation_public_key_path = conf
@@ -1102,7 +1098,6 @@ impl PageServerConf {
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
            l0_flush: L0FlushConfig::default(),
-            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
        }
    }
 }
@@ -1260,6 +1255,7 @@ max_file_descriptors = 333

 # initial superuser role name to use when creating a new tenant
 initial_superuser_name = 'zzzz'
+id = 10

 metric_collection_interval = '222 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
@@ -1276,8 +1272,9 @@ background_task_maximum_delay = '334 s'
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
        // we have to create dummy values to overcome the validation errors
-        let config_string =
-            format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
+        let config_string = format!(
+            "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
+        );
        let toml = config_string.parse()?;

        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
@@ -1344,7 +1341,6 @@ background_task_maximum_delay = '334 s'
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
-                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1419,7 +1415,6 @@ background_task_maximum_delay = '334 s'
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
-                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1584,6 +1579,7 @@ broker_endpoint = '{broker_endpoint}'
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
+id = 222

 [disk_usage_based_eviction]
 max_usage_pct = 80
@@ -1653,6 +1649,7 @@ threshold = "20m"
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
+id = 222

 [tenant_config]
 evictions_low_residence_duration_metric_threshold = "20m"
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -296,11 +296,6 @@ impl From<GetActiveTenantError> for ApiError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => {
                ApiError::ResourceUnavailable(format!("{}", e).into())
            }
-            GetActiveTenantError::SwitchedTenant => {
-                // in our HTTP handlers, this error doesn't happen
-                // TODO: separate error types
-                ApiError::ResourceUnavailable("switched tenant".into())
-            }
        }
    }
 }
@@ -1816,7 +1811,7 @@ async fn timeline_detach_ancestor_handler(
        // drop(tenant);

        let resp = match progress {
-            detach_ancestor::Progress::Prepared(_guard, prepared) => {
+            detach_ancestor::Progress::Prepared(attempt, prepared) => {
                // it would be great to tag the guard on to the tenant activation future
                let reparented_timelines = state
                    .tenant_manager
@@ -1824,10 +1819,10 @@ async fn timeline_detach_ancestor_handler(
                        tenant_shard_id,
                        timeline_id,
                        prepared,
+                        attempt,
                        ctx,
                    )
                    .await
-                    .context("timeline detach ancestor completion")
                    .map_err(ApiError::InternalServerError)?;

                AncestorDetached {
@@ -2134,24 +2129,14 @@ async fn secondary_download_handler(

    let timeout = wait.unwrap_or(Duration::MAX);

-    let result = tokio::time::timeout(
+    let status = match tokio::time::timeout(
        timeout,
        state.secondary_controller.download_tenant(tenant_shard_id),
    )
-    .await;
-
-    let progress = secondary_tenant.progress.lock().unwrap().clone();
-
-    let status = match result {
-        Ok(Ok(())) => {
-            if progress.layers_downloaded >= progress.layers_total {
-                // Download job ran to completion
-                StatusCode::OK
-            } else {
-                // Download dropped out without errors because it ran out of time budget
-                StatusCode::ACCEPTED
-            }
-        }
+    .await
+    {
+        // Download job ran to completion.
+        Ok(Ok(())) => StatusCode::OK,
        // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
        // okay.  We could get an error here in the unlikely edge case that the tenant
        // was detached between our check above and executing the download job.
@@ -2161,6 +2146,8 @@ async fn secondary_download_handler(
        Err(_) => StatusCode::ACCEPTED,
    };

+    let progress = secondary_tenant.progress.lock().unwrap().clone();
+
    json_response(status, progress)
 }

--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -2,23 +2,13 @@ use std::{num::NonZeroUsize, sync::Arc};

 use crate::tenant::ephemeral_file;

-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
+#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
+    #[default]
    PageCached,
    #[serde(rename_all = "snake_case")]
-    Direct {
-        max_concurrency: NonZeroUsize,
-    },
-}
-
-impl Default for L0FlushConfig {
-    fn default() -> Self {
-        Self::Direct {
-            // TODO: using num_cpus results in different peak memory usage on different instance types.
-            max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
-        }
-    }
+    Direct { max_concurrency: NonZeroUsize },
 }

 #[derive(Clone)]
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -12,8 +12,6 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
-
-use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
 pub mod aux_file;
@@ -32,13 +30,14 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
 use tenant::{
    mgr::{BackgroundPurges, TenantManager},
    secondary,
 };
-use tracing::{info, info_span};
+use tracing::info;

 /// Current storage format version
 ///
@@ -64,6 +63,7 @@ pub struct CancellableTask {
    pub cancel: CancellationToken,
 }
 pub struct HttpEndpointListener(pub CancellableTask);
+pub struct LibpqEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
 impl CancellableTask {
@@ -77,7 +77,7 @@ impl CancellableTask {
 #[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
    http_listener: HttpEndpointListener,
-    page_service: page_service::Listener,
+    libpq_listener: LibpqEndpointListener,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
@@ -87,83 +87,10 @@ pub async fn shutdown_pageserver(
    exit_code: i32,
 ) {
    use std::time::Duration;
-
-    // If the orderly shutdown below takes too long, we still want to make
-    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
-    //
-    // (Leftover walredo processes are the hypothesized trigger for the systemd freezes
-    //  that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
-    //
-    // We use a thread instead of a tokio task because the background runtime is likely busy
-    // with the final flushing / uploads. This activity here has priority, and due to lack
-    // of scheduling priority feature sin the tokio scheduler, using a separate thread is
-    // an effective priority booster.
-    let walredo_extraordinary_shutdown_thread_span = {
-        let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
-        span.follows_from(tracing::Span::current());
-        span
-    };
-    let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
-    let walredo_extraordinary_shutdown_thread = std::thread::spawn({
-        let walredo_extraordinary_shutdown_thread_cancel =
-            walredo_extraordinary_shutdown_thread_cancel.clone();
-        move || {
-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .unwrap();
-            let _entered = rt.enter();
-            let _entered = walredo_extraordinary_shutdown_thread_span.enter();
-            if let Ok(()) = rt.block_on(tokio::time::timeout(
-                Duration::from_secs(8),
-                walredo_extraordinary_shutdown_thread_cancel.cancelled(),
-            )) {
-                info!("cancellation requested");
-                return;
-            }
-            let managers = tenant::WALREDO_MANAGERS
-                .lock()
-                .unwrap()
-                // prevents new walredo managers from being inserted
-                .take()
-                .expect("only we take()");
-            // Use FuturesUnordered to get in queue early for each manager's
-            // heavier_once_cell semaphore wait list.
-            // Also, for idle tenants that for some reason haven't
-            // shut down yet, it's quite likely that we're not going
-            // to get Poll::Pending once.
-            let mut futs: FuturesUnordered<_> = managers
-                .into_iter()
-                .filter_map(|(_, mgr)| mgr.upgrade())
-                .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
-                .collect();
-            info!(count=%futs.len(), "built FuturesUnordered");
-            let mut last_log_at = std::time::Instant::now();
-            #[derive(Debug, Default)]
-            struct Results {
-                initiated: u64,
-                already: u64,
-            }
-            let mut results = Results::default();
-            while let Some(we_initiated) = rt.block_on(futs.next()) {
-                if we_initiated {
-                    results.initiated += 1;
-                } else {
-                    results.already += 1;
-                }
-                if last_log_at.elapsed() > Duration::from_millis(100) {
-                    info!(remaining=%futs.len(), ?results, "progress");
-                    last_log_at = std::time::Instant::now();
-                }
-            }
-            info!(?results, "done");
-        }
-    });
-
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    let remaining_connections = timed(
-        page_service.stop_accepting(),
+    timed(
+        libpq_listener.0.shutdown(),
        "shutdown LibpqEndpointListener",
        Duration::from_secs(1),
    )
@@ -181,7 +108,7 @@ pub async fn shutdown_pageserver(
    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
    // should already have been canclled via mgr::shutdown_all_tenants
    timed(
-        remaining_connections.shutdown(),
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
        "shutdown PageRequestHandlers",
        Duration::from_secs(1),
    )
@@ -235,12 +162,6 @@ pub async fn shutdown_pageserver(
        Duration::from_secs(1),
    )
    .await;
-
-    info!("cancel & join walredo_extraordinary_shutdown_thread");
-    walredo_extraordinary_shutdown_thread_cancel.cancel();
-    walredo_extraordinary_shutdown_thread.join().unwrap();
-    info!("walredo_extraordinary_shutdown_thread done");
-
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -525,15 +525,6 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_visible_physical_size",
-        "The size of the layer files present in the pageserver's filesystem.",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
        "pageserver_resident_physical_size_global",
@@ -622,23 +613,7 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_compression_image_in_bytes_total",
-        "Size of data written into image layers before compression"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_in_bytes_considered",
-        "Size of potentially compressible data written into image layers before compression"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_in_bytes_chosen",
-        "Size of data whose compressed form was written into image layers"
+        "Size of uncompressed data written into image layers"
    )
    .expect("failed to define a metric")
 });
@@ -2213,7 +2188,6 @@ pub(crate) struct TimelineMetrics {
    pub(crate) layer_count_delta: UIntGauge,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
-    pub visible_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub aux_file_size_gauge: IntGauge,
@@ -2336,9 +2310,6 @@ impl TimelineMetrics {
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
-        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        // TODO: we shouldn't expose this metric
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2393,7 +2364,6 @@ impl TimelineMetrics {
            layer_count_delta,
            standby_horizon_gauge,
            resident_physical_size_gauge,
-            visible_physical_size_gauge,
            current_logical_size_gauge,
            aux_file_size_gauge,
            directory_entries_count_gauge,
@@ -2445,7 +2415,6 @@ impl TimelineMetrics {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
-        let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -8,7 +8,8 @@ use std::time::Duration;
 pub use pageserver_api::key::{Key, KEY_SIZE};

 /// A 'value' stored for a one Key.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(test, derive(PartialEq))]
 pub enum Value {
    /// An Image value contains a full copy of the value
    Image(Bytes),
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -33,9 +33,9 @@ use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
 use std::collections::BTreeMap;
 use std::fmt;
-use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::detach_ancestor;
 use tokio::io::BufReader;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
@@ -103,7 +103,8 @@ use std::fmt::Debug;
 use std::fmt::Display;
 use std::fs;
 use std::fs::File;
-use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::atomic::AtomicU64;
+use std::sync::atomic::Ordering;
 use std::sync::Arc;
 use std::sync::Mutex;
 use std::time::{Duration, Instant};
@@ -300,8 +301,11 @@ pub struct Tenant {
    pub(crate) timeline_get_throttle:
        Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,

-    /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
-    ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
+    /// An ongoing timeline detach must be checked during attempts to GC a timeline.
+    ///
+    /// Once a timeline detach ancestor attempt has started, GC is blocked until it completes. This
+    /// allows retrying the ancestor detach until we can be certain that all reparentings are done.
+    ongoing_timeline_detach: timeline::detach_ancestor::SharedState,

    l0_flush_global_state: L0FlushGlobalState,
 }
@@ -313,66 +317,14 @@ impl std::fmt::Debug for Tenant {
 }

 pub(crate) enum WalRedoManager {
-    Prod(WalredoManagerId, PostgresRedoManager),
+    Prod(PostgresRedoManager),
    #[cfg(test)]
    Test(harness::TestRedoManager),
 }

-#[derive(thiserror::Error, Debug)]
-#[error("pageserver is shutting down")]
-pub(crate) struct GlobalShutDown;
-
-impl WalRedoManager {
-    pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
-        let id = WalredoManagerId::next();
-        let arc = Arc::new(Self::Prod(id, mgr));
-        let mut guard = WALREDO_MANAGERS.lock().unwrap();
-        match &mut *guard {
-            Some(map) => {
-                map.insert(id, Arc::downgrade(&arc));
-                Ok(arc)
-            }
-            None => Err(GlobalShutDown),
-        }
-    }
-}
-
-impl Drop for WalRedoManager {
-    fn drop(&mut self) {
-        match self {
-            Self::Prod(id, _) => {
-                let mut guard = WALREDO_MANAGERS.lock().unwrap();
-                if let Some(map) = &mut *guard {
-                    map.remove(id).expect("new() registers, drop() unregisters");
-                }
-            }
-            #[cfg(test)]
-            Self::Test(_) => {
-                // Not applicable to test redo manager
-            }
-        }
-    }
-}
-
-/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
-/// the walredo processes outside of the regular order.
-///
-/// This is necessary to work around a systemd bug where it freezes if there are
-/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
-#[allow(clippy::type_complexity)]
-pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
-    Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
-> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
-#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
-pub(crate) struct WalredoManagerId(u64);
-impl WalredoManagerId {
-    pub fn next() -> Self {
-        static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
-        let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        if id == 0 {
-            panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
-        }
-        Self(id)
+impl From<PostgresRedoManager> for WalRedoManager {
+    fn from(mgr: PostgresRedoManager) -> Self {
+        Self::Prod(mgr)
    }
 }

@@ -384,20 +336,19 @@ impl From<harness::TestRedoManager> for WalRedoManager {
 }

 impl WalRedoManager {
-    pub(crate) async fn shutdown(&self) -> bool {
+    pub(crate) async fn shutdown(&self) {
        match self {
-            Self::Prod(_, mgr) => mgr.shutdown().await,
+            Self::Prod(mgr) => mgr.shutdown().await,
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
-                true
            }
        }
    }

    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
        match self {
-            Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
+            Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
@@ -417,7 +368,7 @@ impl WalRedoManager {
        pg_version: u32,
    ) -> Result<bytes::Bytes, walredo::Error> {
        match self {
-            Self::Prod(_, mgr) => {
+            Self::Prod(mgr) => {
                mgr.request_redo(key, lsn, base_img, records, pg_version)
                    .await
            }
@@ -431,7 +382,7 @@ impl WalRedoManager {

    pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
        match self {
-            WalRedoManager::Prod(_, m) => Some(m.status()),
+            WalRedoManager::Prod(m) => Some(m.status()),
            #[cfg(test)]
            WalRedoManager::Test(_) => None,
        }
@@ -440,8 +391,6 @@ impl WalRedoManager {

 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 pub enum GetTimelineError {
-    #[error("Timeline is shutting down")]
-    ShuttingDown,
    #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
    NotActive {
        tenant_id: TenantShardId,
@@ -730,10 +679,13 @@ impl Tenant {
        shard_identity: ShardIdentity,
        init_order: Option<InitializationOrder>,
        mode: SpawnMode,
+        existing_detach_attempt: Option<&detach_ancestor::Attempt>,
        ctx: &RequestContext,
-    ) -> Result<Arc<Tenant>, GlobalShutDown> {
-        let wal_redo_manager =
-            WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
+    ) -> Arc<Tenant> {
+        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
+            conf,
+            tenant_shard_id,
+        )));

        let TenantSharedResources {
            broker_client,
@@ -757,6 +709,12 @@ impl Tenant {
            l0_flush_global_state,
        ));

+        if let Some(attempt) = existing_detach_attempt {
+            tenant
+                .ongoing_timeline_detach
+                .continue_existing_attempt(attempt);
+        }
+
        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
        // we shut down while attaching.
        let attach_gate_guard = tenant
@@ -809,9 +767,9 @@ impl Tenant {
                            // The Stopping case is for when we have passed control on to DeleteTenantFlow:
                            // if it errors, we will call make_broken when tenant is already in Stopping.
                            assert!(
-                            matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
-                            "the attach task owns the tenant state until activation is complete"
-                        );
+                                matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
+                                "the attach task owns the tenant state until activation is complete"
+                            );

                            *state = TenantState::broken_from_reason(err.to_string());
                        });
@@ -932,7 +890,7 @@ impl Tenant {
            }
            .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
        );
-        Ok(tenant)
+        tenant
    }

    #[instrument(skip_all)]
@@ -1036,6 +994,8 @@ impl Tenant {
            }
        }

+        let mut shared_state_builder = timeline::detach_ancestor::SharedStateBuilder::default();
+
        // For every timeline, download the metadata file, scan the local directory,
        // and build a layer map that contains an entry for each remote and local
        // layer file.
@@ -1045,6 +1005,8 @@ impl Tenant {
                .remove(&timeline_id)
                .expect("just put it in above");

+            shared_state_builder.record_loading_timeline(&timeline_id, &index_part);
+
            // TODO again handle early failure
            self.load_remote_timeline(
                timeline_id,
@@ -1089,6 +1051,8 @@ impl Tenant {
        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

+        shared_state_builder.build(&self.ongoing_timeline_detach);
+
        fail::fail_point!("attach-before-activate", |_| {
            anyhow::bail!("attach-before-activate");
        });
@@ -1280,29 +1244,11 @@ impl Tenant {
        Ok(timeline_preloads)
    }

-    pub(crate) async fn apply_timeline_archival_config(
+    pub async fn apply_timeline_archival_config(
        &self,
-        timeline_id: TimelineId,
-        state: TimelineArchivalState,
+        _timeline_id: TimelineId,
+        _config: TimelineArchivalState,
    ) -> anyhow::Result<()> {
-        let timeline = self
-            .get_timeline(timeline_id, false)
-            .context("Cannot apply timeline archival config to inexistent timeline")?;
-
-        let upload_needed = timeline
-            .remote_client
-            .schedule_index_upload_for_timeline_archival_state(state)?;
-
-        if upload_needed {
-            const MAX_WAIT: Duration = Duration::from_secs(10);
-            let Ok(v) =
-                tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
-            else {
-                tracing::warn!("reached timeout for waiting on upload queue");
-                bail!("reached timeout for upload queue flush");
-            };
-            v?;
-        }
        Ok(())
    }

@@ -1634,7 +1580,7 @@ impl Tenant {
        self: Arc<Self>,
        timeline_id: TimelineId,
    ) -> Result<(), DeleteTimelineError> {
-        DeleteTimelineFlow::run(&self, timeline_id).await?;
+        DeleteTimelineFlow::run(&self, timeline_id, false).await?;

        Ok(())
    }
@@ -1679,6 +1625,11 @@ impl Tenant {
            }
        }

+        if self.ongoing_timeline_detach.attempt_blocks_gc() {
+            info!("Skipping GC while there is an ongoing detach_ancestor attempt");
+            return Ok(GcResult::default());
+        }
+
        self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
            .await
    }
@@ -1687,23 +1638,21 @@ impl Tenant {
    /// This function is periodically called by compactor task.
    /// Also it can be explicitly requested per timeline through page server
    /// api's 'compact' command.
-    ///
-    /// Returns whether we have pending compaction task.
    async fn compaction_iteration(
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<bool, timeline::CompactionError> {
+    ) -> Result<(), timeline::CompactionError> {
        // Don't start doing work during shutdown, or when broken, we do not need those in the logs
        if !self.is_active() {
-            return Ok(false);
+            return Ok(());
        }

        {
            let conf = self.tenant_conf.load();
            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
                info!("Skipping compaction in location state {:?}", conf.location);
-                return Ok(false);
+                return Ok(());
            }
        }

@@ -1730,13 +1679,11 @@ impl Tenant {
        // Before doing any I/O work, check our circuit breaker
        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
            info!("Skipping compaction due to previous failures");
-            return Ok(false);
+            return Ok(());
        }

-        let mut has_pending_task = false;
-
        for (timeline_id, timeline) in &timelines_to_compact {
-            has_pending_task |= timeline
+            timeline
                .compact(cancel, EnumSet::empty(), ctx)
                .instrument(info_span!("compact_timeline", %timeline_id))
                .await
@@ -1756,7 +1703,7 @@ impl Tenant {
            .unwrap()
            .success(&CIRCUIT_BREAKERS_UNBROKEN);

-        Ok(has_pending_task)
+        Ok(())
    }

    // Call through to all timelines to freeze ephemeral layers if needed.  Usually
@@ -2690,7 +2637,7 @@ impl Tenant {
                &crate::metrics::tenant_throttling::TIMELINE_GET,
            )),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
-            ongoing_timeline_detach: std::sync::Mutex::default(),
+            ongoing_timeline_detach: Default::default(),
            l0_flush_global_state,
        }
    }
@@ -6963,11 +6910,7 @@ mod tests {
            vec![
                // Image layer at GC horizon
                PersistentLayerKey {
-                    key_range: {
-                        let mut key = Key::MAX;
-                        key.field6 -= 1;
-                        Key::MIN..key
-                    },
+                    key_range: Key::MIN..Key::MAX,
                    lsn_range: Lsn(0x30)..Lsn(0x31),
                    is_delta: false
                },
@@ -6986,15 +6929,6 @@ mod tests {
            ]
        );

-        // increase GC horizon and compact again
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x40);
-            guard.cutoffs.space = Lsn(0x40);
-        }
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
-
        Ok(())
    }

@@ -7346,15 +7280,6 @@ mod tests {
            );
        }

-        // increase GC horizon and compact again
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x40);
-            guard.cutoffs.space = Lsn(0x40);
-        }
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
-
        Ok(())
    }

@@ -7423,7 +7348,6 @@ mod tests {
                Lsn(0x60),
                &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
                3,
-                None,
            )
            .await
            .unwrap();
@@ -7548,7 +7472,7 @@ mod tests {
            ),
        ];
        let res = tline
-            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
+            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3)
            .await
            .unwrap();
        let expected_res = KeyHistoryRetention {
@@ -7594,114 +7518,6 @@ mod tests {
        };
        assert_eq!(res, expected_res);

-        // In case of branch compaction, the branch itself does not have the full history, and we need to provide
-        // the ancestor image in the test case.
-
-        let history = vec![
-            (
-                key,
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-            ),
-            (
-                key,
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
-            ),
-            (
-                key,
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
-            ),
-            (
-                key,
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            ),
-        ];
-        let res = tline
-            .generate_key_retention(
-                key,
-                &history,
-                Lsn(0x60),
-                &[],
-                3,
-                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
-            )
-            .await
-            .unwrap();
-        let expected_res = KeyHistoryRetention {
-            below_horizon: vec![(
-                Lsn(0x60),
-                KeyLogAtLsn(vec![(
-                    Lsn(0x60),
-                    Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
-                )]),
-            )],
-            above_horizon: KeyLogAtLsn(vec![(
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            )]),
-        };
-        assert_eq!(res, expected_res);
-
-        let history = vec![
-            (
-                key,
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-            ),
-            (
-                key,
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
-            ),
-            (
-                key,
-                Lsn(0x60),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
-            ),
-            (
-                key,
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            ),
-        ];
-        let res = tline
-            .generate_key_retention(
-                key,
-                &history,
-                Lsn(0x60),
-                &[Lsn(0x30)],
-                3,
-                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
-            )
-            .await
-            .unwrap();
-        let expected_res = KeyHistoryRetention {
-            below_horizon: vec![
-                (
-                    Lsn(0x30),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x20),
-                        Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-                    )]),
-                ),
-                (
-                    Lsn(0x60),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x60),
-                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
-                    )]),
-                ),
-            ],
-            above_horizon: KeyLogAtLsn(vec![(
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            )]),
-        };
-        assert_eq!(res, expected_res);
-
        Ok(())
    }

@@ -7859,10 +7675,6 @@ mod tests {
        ];

        let verify_result = || async {
-            let gc_horizon = {
-                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
-            };
            for idx in 0..10 {
                assert_eq!(
                    tline
@@ -7873,7 +7685,7 @@ mod tests {
                );
                assert_eq!(
                    tline
-                        .get(get_key(idx as u32), gc_horizon, &ctx)
+                        .get(get_key(idx as u32), Lsn(0x30), &ctx)
                        .await
                        .unwrap(),
                    &expected_result_at_gc_horizon[idx]
@@ -7899,205 +7711,6 @@ mod tests {

        let cancel = CancellationToken::new();
        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
-        verify_result().await;
-
-        // compact again
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
-        verify_result().await;
-
-        // increase GC horizon and compact again
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x38);
-            guard.cutoffs.space = Lsn(0x38);
-        }
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
-        verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
-
-        // not increasing the GC horizon and compact again
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
-        verify_result().await;
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
-        let (tenant, ctx) = harness.load().await;
-
-        fn get_key(id: u32) -> Key {
-            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-
-        let img_layer = (0..10)
-            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
-            .collect_vec();
-
-        let delta1 = vec![
-            (
-                get_key(1),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(2),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x28),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
-            ),
-        ];
-        let delta2 = vec![
-            (
-                get_key(5),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(6),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-        ];
-        let delta3 = vec![
-            (
-                get_key(8),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-            (
-                get_key(9),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-        ];
-
-        let parent_tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                vec![],                       // delta layers
-                vec![(Lsn(0x18), img_layer)], // image layers
-                Lsn(0x18),
-            )
-            .await?;
-
-        parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
-
-        let branch_tline = tenant
-            .branch_timeline_test_with_layers(
-                &parent_tline,
-                NEW_TIMELINE_ID,
-                Some(Lsn(0x18)),
-                &ctx,
-                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
-                ], // delta layers
-                vec![], // image layers
-                Lsn(0x50),
-            )
-            .await?;
-
-        branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
-
-        {
-            // Update GC info
-            let mut guard = parent_tline.gc_info.write().unwrap();
-            *guard = GcInfo {
-                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
-                cutoffs: GcCutoffs {
-                    time: Lsn(0x10),
-                    space: Lsn(0x10),
-                },
-                leases: Default::default(),
-                within_ancestor_pitr: false,
-            };
-        }
-
-        {
-            // Update GC info
-            let mut guard = branch_tline.gc_info.write().unwrap();
-            *guard = GcInfo {
-                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
-                cutoffs: GcCutoffs {
-                    time: Lsn(0x50),
-                    space: Lsn(0x50),
-                },
-                leases: Default::default(),
-                within_ancestor_pitr: false,
-            };
-        }
-
-        let expected_result_at_gc_horizon = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10@0x30"),
-            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10@0x48"),
-            Bytes::from_static(b"value 9@0x10@0x48"),
-        ];
-
-        let expected_result_at_lsn_40 = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10@0x30"),
-            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10"),
-            Bytes::from_static(b"value 9@0x10"),
-        ];
-
-        let verify_result = || async {
-            for idx in 0..10 {
-                assert_eq!(
-                    branch_tline
-                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_gc_horizon[idx]
-                );
-                assert_eq!(
-                    branch_tline
-                        .get(get_key(idx as u32), Lsn(0x40), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_lsn_40[idx]
-                );
-            }
-        };
-
-        verify_result().await;
-
-        let cancel = CancellationToken::new();
-        branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        verify_result().await;

--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -28,12 +28,6 @@ use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-#[derive(Copy, Clone, Debug)]
-pub struct CompressionInfo {
-    pub written_compressed: bool,
-    pub compressed_size: Option<usize>,
-}
-
 impl<'a> BlockCursor<'a> {
    /// Read a blob into a new buffer.
    pub async fn read_blob(
@@ -279,10 +273,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: B,
        ctx: &RequestContext,
    ) -> (B::Buf, Result<u64, Error>) {
-        let (buf, res) = self
-            .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
-            .await;
-        (buf, res.map(|(off, _compression_info)| off))
+        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
+            .await
    }

    /// Write a blob of data. Returns the offset that it was written to,
@@ -292,12 +284,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: B,
        ctx: &RequestContext,
        algorithm: ImageCompressionAlgorithm,
-    ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
+    ) -> (B::Buf, Result<u64, Error>) {
        let offset = self.offset;
-        let mut compression_info = CompressionInfo {
-            written_compressed: false,
-            compressed_size: None,
-        };

        let len = srcbuf.bytes_init();

@@ -340,9 +328,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                        encoder.write_all(&slice[..]).await.unwrap();
                        encoder.shutdown().await.unwrap();
                        let compressed = encoder.into_inner();
-                        compression_info.compressed_size = Some(compressed.len());
                        if compressed.len() < len {
-                            compression_info.written_compressed = true;
                            let compressed_len = compressed.len();
                            compressed_buf = Some(compressed);
                            (BYTE_ZSTD, compressed_len, slice.into_inner())
@@ -373,7 +359,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        } else {
            self.write_all(srcbuf, ctx).await
        };
-        (srcbuf, res.map(|_| (offset, compression_info)))
+        (srcbuf, res.map(|_| offset))
    }
 }

@@ -430,14 +416,12 @@ pub(crate) mod tests {
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
                let (_, res) = if compression {
-                    let res = wtr
-                        .write_blob_maybe_compressed(
-                            blob.clone(),
-                            ctx,
-                            ImageCompressionAlgorithm::Zstd { level: Some(1) },
-                        )
-                        .await;
-                    (res.0, res.1.map(|(off, _)| off))
+                    wtr.write_blob_maybe_compressed(
+                        blob.clone(),
+                        ctx,
+                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
+                    )
+                    .await
                } else {
                    wtr.write_blob(blob.clone(), ctx).await
                };
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -296,19 +296,13 @@ where
            let mut stack = Vec::new();
            stack.push((self.root_blk, None));
            let block_cursor = self.reader.block_cursor();
-            let mut node_buf = [0_u8; PAGE_SZ];
            while let Some((node_blknum, opt_iter)) = stack.pop() {
-                // Read the node, through the PS PageCache, into local variable `node_buf`.
-                // We could keep the page cache read guard alive, but, at the time of writing,
-                // we run quite small PS PageCache s => can't risk running out of
-                // PageCache space because this stream isn't consumed fast enough.
-                let page_read_guard = block_cursor
+                // Locate the node.
+                let node_buf = block_cursor
                    .read_blk(self.start_blk + node_blknum, ctx)
                    .await?;
-                node_buf.copy_from_slice(page_read_guard.as_ref());
-                drop(page_read_guard); // drop page cache read guard early

-                let node = OnDiskNode::deparse(&node_buf)?;
+                let node = OnDiskNode::deparse(node_buf.as_ref())?;
                let prefix_len = node.prefix_len as usize;
                let suffix_len = node.suffix_len as usize;

@@ -351,7 +345,6 @@ where
                    Either::Left(idx..node.num_children.into())
                };

-
                // idx points to the first match now. Keep going from there
                while let Some(idx) = iter.next() {
                    let key_off = idx * suffix_len;
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,8 +51,7 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
-use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
+use pageserver_api::keyspace::KeySpaceAccum;
 use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
@@ -62,7 +61,7 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;

-use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
+use super::storage_layer::PersistentLayerDesc;

 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -872,183 +871,11 @@ impl LayerMap {
        println!("End dump LayerMap");
        Ok(())
    }
-
-    /// `read_points` represent the tip of a timeline and any branch points, i.e. the places
-    /// where we expect to serve reads.
-    ///
-    /// This function is O(N) and should be called infrequently.  The caller is responsible for
-    /// looking up and updating the Layer objects for these layer descriptors.
-    pub fn get_visibility(
-        &self,
-        mut read_points: Vec<Lsn>,
-    ) -> (
-        Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
-        KeySpace,
-    ) {
-        // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
-        // KeySpace is intended to be composed statically and iterated over.
-        struct KeyShadow {
-            // Map of range start to range end
-            inner: RangeSetBlaze<i128>,
-        }
-
-        impl KeyShadow {
-            fn new() -> Self {
-                Self {
-                    inner: Default::default(),
-                }
-            }
-
-            fn contains(&self, range: Range<Key>) -> bool {
-                let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
-                self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
-                    CheckSortedDisjoint::from([range_incl]),
-                ))
-            }
-
-            /// Add the input range to the keys covered by self.
-            ///
-            /// Return true if inserting this range covered some keys that were previously not covered
-            fn cover(&mut self, insert: Range<Key>) -> bool {
-                let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
-                self.inner.ranges_insert(range_incl)
-            }
-
-            fn reset(&mut self) {
-                self.inner = Default::default();
-            }
-
-            fn to_keyspace(&self) -> KeySpace {
-                let mut accum = KeySpaceAccum::new();
-                for range_incl in self.inner.ranges() {
-                    let range = Range {
-                        start: Key::from_i128(*range_incl.start()),
-                        end: Key::from_i128(range_incl.end() + 1),
-                    };
-                    accum.add_range(range)
-                }
-
-                accum.to_keyspace()
-            }
-        }
-
-        // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
-        // and a ReadPoint
-        read_points.sort_by_key(|rp| rp.0);
-        let mut shadow = KeyShadow::new();
-
-        // We will interleave all our read points and layers into a sorted collection
-        enum Item {
-            ReadPoint { lsn: Lsn },
-            Layer(Arc<PersistentLayerDesc>),
-        }
-
-        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
-        items.extend(self.iter_historic_layers().map(Item::Layer));
-        items.extend(
-            read_points
-                .into_iter()
-                .map(|rp| Item::ReadPoint { lsn: rp }),
-        );
-
-        // Ordering: we want to iterate like this:
-        // 1. Highest LSNs first
-        // 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
-        // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
-        items.sort_by_key(|item| {
-            std::cmp::Reverse(match item {
-                Item::Layer(layer) => {
-                    if layer.is_delta() {
-                        (Lsn(layer.get_lsn_range().end.0 - 1), 0)
-                    } else {
-                        (layer.image_layer_lsn(), 1)
-                    }
-                }
-                Item::ReadPoint { lsn } => (*lsn, 2),
-            })
-        });
-
-        let mut results = Vec::with_capacity(self.historic.len());
-
-        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
-
-        for item in items {
-            let (reached_lsn, is_readpoint) = match &item {
-                Item::ReadPoint { lsn } => (lsn, true),
-                Item::Layer(layer) => (&layer.lsn_range.start, false),
-            };
-            maybe_covered_deltas.retain(|d| {
-                if *reached_lsn >= d.lsn_range.start && is_readpoint {
-                    // We encountered a readpoint within the delta layer: it is visible
-
-                    results.push((d.clone(), LayerVisibilityHint::Visible));
-                    false
-                } else if *reached_lsn < d.lsn_range.start {
-                    // We passed the layer's range without encountering a read point: it is not visible
-                    results.push((d.clone(), LayerVisibilityHint::Covered));
-                    false
-                } else {
-                    // We're still in the delta layer: continue iterating
-                    true
-                }
-            });
-
-            match item {
-                Item::ReadPoint { lsn: _lsn } => {
-                    // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
-                    // to assume that the whole key range is visible at the branch point.
-                    shadow.reset();
-                }
-                Item::Layer(layer) => {
-                    let visibility = if layer.is_delta() {
-                        if shadow.contains(layer.get_key_range()) {
-                            // If a layer isn't visible based on current state, we must defer deciding whether
-                            // it is truly not visible until we have advanced past the delta's range: we might
-                            // encounter another branch point within this delta layer's LSN range.
-                            maybe_covered_deltas.push(layer);
-                            continue;
-                        } else {
-                            LayerVisibilityHint::Visible
-                        }
-                    } else {
-                        let modified = shadow.cover(layer.get_key_range());
-                        if modified {
-                            // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
-                            LayerVisibilityHint::Visible
-                        } else {
-                            // An image layer in a region that was already covered
-                            LayerVisibilityHint::Covered
-                        }
-                    };
-
-                    results.push((layer, visibility));
-                }
-            }
-        }
-
-        // Drain any remaining maybe_covered deltas
-        results.extend(
-            maybe_covered_deltas
-                .into_iter()
-                .map(|d| (d, LayerVisibilityHint::Covered)),
-        );
-
-        (results, shadow.to_keyspace())
-    }
 }

 #[cfg(test)]
 mod tests {
-    use crate::tenant::{storage_layer::LayerName, IndexPart};
-    use pageserver_api::{
-        key::DBDIR_KEY,
-        keyspace::{KeySpace, KeySpaceRandomAccum},
-    };
-    use std::{collections::HashMap, path::PathBuf};
-    use utils::{
-        id::{TenantId, TimelineId},
-        shard::TenantShardId,
-    };
+    use pageserver_api::keyspace::KeySpace;

    use super::*;

@@ -1175,299 +1002,4 @@ mod tests {
            }
        }
    }
-
-    #[test]
-    fn layer_visibility_basic() {
-        // A simple synthetic input, as a smoke test.
-        let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
-        let timeline_id = TimelineId::generate();
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-
-        const FAKE_LAYER_SIZE: u64 = 1024;
-
-        let inject_delta = |updates: &mut BatchedUpdates,
-                            key_start: i128,
-                            key_end: i128,
-                            lsn_start: u64,
-                            lsn_end: u64| {
-            let desc = PersistentLayerDesc::new_delta(
-                tenant_shard_id,
-                timeline_id,
-                Range {
-                    start: Key::from_i128(key_start),
-                    end: Key::from_i128(key_end),
-                },
-                Range {
-                    start: Lsn(lsn_start),
-                    end: Lsn(lsn_end),
-                },
-                1024,
-            );
-            updates.insert_historic(desc.clone());
-            desc
-        };
-
-        let inject_image =
-            |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
-                let desc = PersistentLayerDesc::new_img(
-                    tenant_shard_id,
-                    timeline_id,
-                    Range {
-                        start: Key::from_i128(key_start),
-                        end: Key::from_i128(key_end),
-                    },
-                    Lsn(lsn),
-                    FAKE_LAYER_SIZE,
-                );
-                updates.insert_historic(desc.clone());
-                desc
-            };
-
-        //
-        // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
-        // we expect to handle.  You can follow these examples through in the same order as they would be processed
-        // by the function under test.
-        //
-
-        let mut read_points = vec![Lsn(1000)];
-
-        // A delta ahead of any image layer
-        let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
-
-        // An image layer is visible and covers some layers beneath itself
-        let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
-
-        // A delta layer covered by the image layer: should be covered
-        let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
-
-        // A delta layer partially covered by an image layer: should be visible
-        let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
-
-        // A delta layer not covered by an image layer: should be visible
-        let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
-
-        // An image layer covered by the image layer above: should be covered
-        let covered_image = inject_image(&mut updates, 10, 20, 89);
-
-        // An image layer partially covered by an image layer: should be visible
-        let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
-
-        // An image layer not covered by an image layer: should be visible
-        let not_covered_image = inject_image(&mut updates, 1, 4, 89);
-
-        // A read point: this will make subsequent layers below here visible, even if there are
-        // more recent layers covering them.
-        read_points.push(Lsn(80));
-
-        // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
-        let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
-
-        // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
-        // the read point should make it visible, even though its end LSN is covered
-        let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
-        let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
-        read_points.push(Lsn(65));
-        let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
-
-        let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
-
-        updates.flush();
-
-        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
-        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
-
-        assert_eq!(
-            layer_visibilities.get(&ahead_layer),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&visible_covering_img),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&partially_covered_delta),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&not_covered_delta),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_image),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&partially_covered_image),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&not_covered_image),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_below_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covering_img_between_read_points),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_between_read_points),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_intersects_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&visible_img_after_last_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-
-        // Shadow should include all the images below the last read point
-        let expected_shadow = KeySpace {
-            ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
-        };
-        assert_eq!(shadow, expected_shadow);
-    }
-
-    fn fixture_path(relative: &str) -> PathBuf {
-        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
-    }
-
-    #[test]
-    fn layer_visibility_realistic() {
-        // Load a large example layermap
-        let index_raw = std::fs::read_to_string(fixture_path(
-            "test_data/indices/mixed_workload/index_part.json",
-        ))
-        .unwrap();
-        let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
-
-        let tenant_id = TenantId::generate();
-        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-        let timeline_id = TimelineId::generate();
-
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-        for (layer_name, layer_metadata) in index.layer_metadata {
-            let layer_desc = match layer_name {
-                LayerName::Image(layer_name) => PersistentLayerDesc {
-                    key_range: layer_name.key_range.clone(),
-                    lsn_range: layer_name.lsn_as_range(),
-                    tenant_shard_id,
-                    timeline_id,
-                    is_delta: false,
-                    file_size: layer_metadata.file_size,
-                },
-                LayerName::Delta(layer_name) => PersistentLayerDesc {
-                    key_range: layer_name.key_range,
-                    lsn_range: layer_name.lsn_range,
-                    tenant_shard_id,
-                    timeline_id,
-                    is_delta: true,
-                    file_size: layer_metadata.file_size,
-                },
-            };
-            updates.insert_historic(layer_desc);
-        }
-        updates.flush();
-
-        let read_points = vec![index.metadata.disk_consistent_lsn()];
-        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
-        for (layer_desc, visibility) in &layer_visibilities {
-            tracing::info!("{layer_desc:?}: {visibility:?}");
-            eprintln!("{layer_desc:?}: {visibility:?}");
-        }
-
-        // The shadow should be non-empty, since there were some image layers
-        assert!(!shadow.ranges.is_empty());
-
-        // At least some layers should be marked covered
-        assert!(layer_visibilities
-            .iter()
-            .any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
-
-        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
-
-        // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
-        for (layer_desc, visible) in &layer_visibilities {
-            let mut coverage = KeySpaceRandomAccum::new();
-            let mut covered_by = Vec::new();
-
-            for other_layer in layer_map.iter_historic_layers() {
-                if &other_layer == layer_desc {
-                    continue;
-                }
-                if !other_layer.is_delta()
-                    && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
-                    && other_layer.key_range.start <= layer_desc.key_range.end
-                    && layer_desc.key_range.start <= other_layer.key_range.end
-                {
-                    coverage.add_range(other_layer.get_key_range());
-                    covered_by.push((*other_layer).clone());
-                }
-            }
-            let coverage = coverage.to_keyspace();
-
-            let expect_visible = if coverage.ranges.len() == 1
-                && coverage.contains(&layer_desc.key_range.start)
-                && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
-            {
-                LayerVisibilityHint::Covered
-            } else {
-                LayerVisibilityHint::Visible
-            };
-
-            if expect_visible != *visible {
-                eprintln!(
-                    "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
-                    layer_desc.key_range.start,
-                    layer_desc.key_range.end,
-                    layer_desc.lsn_range.start,
-                    layer_desc.lsn_range.end,
-                    layer_desc.is_delta()
-                );
-                if expect_visible == LayerVisibilityHint::Covered {
-                    eprintln!("Covered by:");
-                    for other in covered_by {
-                        eprintln!(
-                            "  {}..{} @ {}",
-                            other.get_key_range().start,
-                            other.get_key_range().end,
-                            other.image_layer_lsn()
-                        );
-                    }
-                    if let Some(range) = coverage.ranges.first() {
-                        eprintln!(
-                            "Total coverage from contributing layers: {}..{}",
-                            range.start, range.end
-                        );
-                    } else {
-                        eprintln!(
-                            "Total coverage from contributing layers: {:?}",
-                            coverage.ranges
-                        );
-                    }
-                }
-            }
-            assert_eq!(expect_visible, *visible);
-        }
-
-        // Sanity: the layer that holds latest data for the DBDIR key should always be visible
-        // (just using this key as a key that will always exist for any layermap fixture)
-        let dbdir_layer = layer_map
-            .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
-            .unwrap();
-        assert!(matches!(
-            layer_visibilities.get(&dbdir_layer.layer).unwrap(),
-            LayerVisibilityHint::Visible
-        ));
-    }
 }
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -521,10 +521,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {

        Ok(&self.historic_coverage)
    }
-
-    pub(crate) fn len(&self) -> usize {
-        self.layers.len()
-    }
 }

 #[test]
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -111,7 +111,7 @@ impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
 #[error("re-serializing for crc32 failed")]
 struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);

-const METADATA_HDR_SIZE: usize = size_of::<TimelineMetadataHeader>();
+const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 struct TimelineMetadataBodyV2 {
@@ -285,21 +285,30 @@ impl TimelineMetadata {
    }

    /// When reparenting, the `ancestor_lsn` does not change.
-    pub fn reparent(&mut self, timeline: &TimelineId) {
+    ///
+    /// Returns true if anything was changed.
+    pub fn reparent(&mut self, timeline: &TimelineId) -> bool {
        assert!(self.body.ancestor_timeline.is_some());
        // no assertion for redoing this: it's fine, we may have to repeat this multiple times over
+        let prev = self.body.ancestor_timeline;
        self.body.ancestor_timeline = Some(*timeline);
+        prev.as_ref() != Some(timeline)
    }

-    pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
+    /// Returns true if anything was changed
+    pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
+        let mut changed = false;
        if let Some(ancestor) = self.body.ancestor_timeline {
            assert_eq!(ancestor, branchpoint.0);
+            changed = true;
        }
        if self.body.ancestor_lsn != Lsn(0) {
            assert_eq!(self.body.ancestor_lsn, branchpoint.1);
+            changed = true;
        }
        self.body.ancestor_timeline = None;
        self.body.ancestor_lsn = Lsn(0);
+        changed
    }

    pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
-use std::collections::{BTreeMap, HashMap};
+use std::collections::{BTreeMap, HashMap, HashSet};
 use std::ops::Deref;
 use std::sync::Arc;
 use std::time::Duration;
@@ -54,8 +54,8 @@ use utils::id::{TenantId, TimelineId};

 use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
-use super::timeline::detach_ancestor::PreparedTimelineDetach;
-use super::{GlobalShutDown, TenantSharedResources};
+use super::timeline::detach_ancestor::{self, PreparedTimelineDetach};
+use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is elegible to service
@@ -116,6 +116,8 @@ pub(crate) enum ShardSelector {
    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
    /// ignore it.
    Zero,
+    /// Pick the first shard we find for the TenantId
+    First,
    /// Pick the shard that holds this key
    Page(Key),
    /// The shard ID is known: pick the given shard
@@ -665,20 +667,18 @@ pub async fn init_tenant_mgr(
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
-            LocationMode::Attached(attached_conf) => TenantSlot::Attached(
-                tenant_spawn(
-                    conf,
-                    tenant_shard_id,
-                    &tenant_dir_path,
-                    resources.clone(),
-                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-                    shard_identity,
-                    Some(init_order.clone()),
-                    SpawnMode::Lazy,
-                    &ctx,
-                )
-                .expect("global shutdown during init_tenant_mgr cannot happen"),
-            ),
+            LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
+                conf,
+                tenant_shard_id,
+                &tenant_dir_path,
+                resources.clone(),
+                AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                shard_identity,
+                Some(init_order.clone()),
+                SpawnMode::Lazy,
+                None,
+                &ctx,
+            )),
            LocationMode::Secondary(secondary_conf) => {
                info!(
                    tenant_id = %tenant_shard_id.tenant_id,
@@ -725,8 +725,9 @@ fn tenant_spawn(
    shard_identity: ShardIdentity,
    init_order: Option<InitializationOrder>,
    mode: SpawnMode,
+    existing_detach_attempt: Option<&detach_ancestor::Attempt>,
    ctx: &RequestContext,
-) -> Result<Arc<Tenant>, GlobalShutDown> {
+) -> Arc<Tenant> {
    // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
    // path, and contains a configuration file.  Assertions that do synchronous I/O are limited to debug mode
    // to avoid impacting prod runtime performance.
@@ -745,6 +746,7 @@ fn tenant_spawn(
        shard_identity,
        init_order,
        mode,
+        existing_detach_attempt,
        ctx,
    )
 }
@@ -1192,11 +1194,9 @@ impl TenantManager {
                    shard_identity,
                    None,
                    spawn_mode,
+                    None,
                    ctx,
-                )
-                .map_err(|_: GlobalShutDown| {
-                    UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
-                })?;
+                );

                TenantSlot::Attached(tenant)
            }
@@ -1316,8 +1316,9 @@ impl TenantManager {
            shard_identity,
            None,
            SpawnMode::Eager,
+            None,
            ctx,
-        )?;
+        );

        slot_guard.upsert(TenantSlot::Attached(tenant))?;

@@ -1388,32 +1389,34 @@ impl TenantManager {
        tenant_shard_id: TenantShardId,
    ) -> Result<(), DeleteTenantError> {
        let remote_path = remote_tenant_path(&tenant_shard_id);
-        let mut keys_stream = self.resources.remote_storage.list_streaming(
-            Some(&remote_path),
-            remote_storage::ListingMode::NoDelimiter,
-            None,
-            &self.cancel,
-        );
-        while let Some(chunk) = keys_stream.next().await {
-            let keys = match chunk {
-                Ok(listing) => listing.keys,
-                Err(remote_storage::DownloadError::Cancelled) => {
-                    return Err(DeleteTenantError::Cancelled)
-                }
-                Err(remote_storage::DownloadError::NotFound) => return Ok(()),
-                Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
-            };
-
-            if keys.is_empty() {
-                tracing::info!("Remote storage already deleted");
-            } else {
-                tracing::info!("Deleting {} keys from remote storage", keys.len());
-                let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
-                self.resources
-                    .remote_storage
-                    .delete_objects(&keys, &self.cancel)
-                    .await?;
+        let keys = match self
+            .resources
+            .remote_storage
+            .list(
+                Some(&remote_path),
+                remote_storage::ListingMode::NoDelimiter,
+                None,
+                &self.cancel,
+            )
+            .await
+        {
+            Ok(listing) => listing.keys,
+            Err(remote_storage::DownloadError::Cancelled) => {
+                return Err(DeleteTenantError::Cancelled)
            }
+            Err(remote_storage::DownloadError::NotFound) => return Ok(()),
+            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
+        };
+
+        if keys.is_empty() {
+            tracing::info!("Remote storage already deleted");
+        } else {
+            tracing::info!("Deleting {} keys from remote storage", keys.len());
+            let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
+            self.resources
+                .remote_storage
+                .delete_objects(&keys, &self.cancel)
+                .await?;
        }

        Ok(())
@@ -1970,8 +1973,10 @@ impl TenantManager {
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        prepared: PreparedTimelineDetach,
+        mut attempt: detach_ancestor::Attempt,
        ctx: &RequestContext,
-    ) -> Result<Vec<TimelineId>, anyhow::Error> {
+    ) -> Result<HashSet<TimelineId>, anyhow::Error> {
+        // FIXME: this is unnecessary, slotguard already has these semantics
        struct RevertOnDropSlot(Option<SlotGuard>);

        impl Drop for RevertOnDropSlot {
@@ -2019,43 +2024,66 @@ impl TenantManager {

        let timeline = tenant.get_timeline(timeline_id, true)?;

-        let reparented = timeline
-            .complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
+        let resp = timeline
+            .detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
            .await?;

        let mut slot_guard = slot_guard.into_inner();

-        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, ShutdownMode::Hard).await {
-            Ok(()) => {
-                slot_guard.drop_old_value()?;
-            }
-            Err(_barrier) => {
-                slot_guard.revert();
-                // this really should not happen, at all, unless shutdown was already going?
-                anyhow::bail!("Cannot restart Tenant, already shutting down");
+        let tenant = if resp.reset_tenant_required() {
+            attempt.before_shutdown();
+
+            let (_guard, progress) = utils::completion::channel();
+            match tenant.shutdown(progress, ShutdownMode::Hard).await {
+                Ok(()) => {
+                    slot_guard.drop_old_value()?;
+                }
+                Err(_barrier) => {
+                    slot_guard.revert();
+                    // this really should not happen, at all, unless shutdown was already going?
+                    anyhow::bail!("Cannot restart Tenant, already shutting down");
+                }
            }
+
+            let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+            let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+
+            let shard_identity = config.shard;
+            let tenant = tenant_spawn(
+                self.conf,
+                tenant_shard_id,
+                &tenant_path,
+                self.resources.clone(),
+                AttachedTenantConf::try_from(config)?,
+                shard_identity,
+                None,
+                SpawnMode::Eager,
+                Some(&attempt),
+                ctx,
+            );
+
+            slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
+            tenant
+        } else {
+            tracing::info!("skipping tenant_reset as no changes made required it");
+            tenant
+        };
+
+        if let Some(reparented) = resp.completed() {
+            // finally ask the restarted tenant to complete the detach
+            tenant
+                .ongoing_timeline_detach
+                .complete(attempt, &tenant)
+                .await?;
+            Ok(reparented)
+        } else {
+            // at least the latest versions have now been downloaded and refreshed; be ready to
+            // retry another time.
+            tenant.ongoing_timeline_detach.cancel(attempt);
+            Err(anyhow::anyhow!(
+                "failed to reparent all candidate timelines, please retry"
+            ))
        }
-
-        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
-
-        let shard_identity = config.shard;
-        let tenant = tenant_spawn(
-            self.conf,
-            tenant_shard_id,
-            &tenant_path,
-            self.resources.clone(),
-            AttachedTenantConf::try_from(config)?,
-            shard_identity,
-            None,
-            SpawnMode::Eager,
-            ctx,
-        )?;
-
-        slot_guard.upsert(TenantSlot::Attached(tenant))?;
-
-        Ok(reparented)
    }

    /// A page service client sends a TenantId, and to look up the correct Tenant we must
@@ -2092,6 +2120,7 @@ impl TenantManager {
                    };

                    match selector {
+                        ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                            return ShardResolveResult::Found(tenant.clone())
                        }
@@ -2173,9 +2202,6 @@ pub(crate) enum GetActiveTenantError {
    /// never happen.
    #[error("Tenant is broken: {0}")]
    Broken(String),
-
-    #[error("reconnect to switch tenant id")]
-    SwitchedTenant,
 }

 #[derive(Debug, thiserror::Error)]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -187,7 +187,7 @@ use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};

 pub(crate) use download::download_initdb_tar_zst;
-use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState};
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
@@ -457,17 +457,6 @@ impl RemoteTimelineClient {
            .unwrap_or(false)
    }

-    /// Returns whether the timeline is archived.
-    /// Return None if the remote index_part hasn't been downloaded yet.
-    pub(crate) fn is_archived(&self) -> Option<bool> {
-        self.upload_queue
-            .lock()
-            .unwrap()
-            .initialized_mut()
-            .map(|q| q.clean.0.archived_at.is_some())
-            .ok()
-    }
-
    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
            current_remote_index_part
@@ -628,7 +617,7 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated.
+    /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
    pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
        self: &Arc<Self>,
        last_aux_file_policy: Option<AuxFilePolicy>,
@@ -639,48 +628,6 @@ impl RemoteTimelineClient {
        self.schedule_index_upload(upload_queue)?;
        Ok(())
    }
-
-    /// Launch an index-file upload operation in the background, with only the `archived_at` field updated.
-    ///
-    /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded,
-    /// so either if the change is already sitting in the queue, but not commited yet, or the change has not
-    /// been in the queue yet.
-    pub(crate) fn schedule_index_upload_for_timeline_archival_state(
-        self: &Arc<Self>,
-        state: TimelineArchivalState,
-    ) -> anyhow::Result<bool> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-
-        /// Returns Some(_) if a change is needed, and Some(true) if it's a
-        /// change needed to set archived_at.
-        fn need_change(
-            archived_at: &Option<NaiveDateTime>,
-            state: TimelineArchivalState,
-        ) -> Option<bool> {
-            match (archived_at, state) {
-                (Some(_), TimelineArchivalState::Archived)
-                | (None, TimelineArchivalState::Unarchived) => {
-                    // Nothing to do
-                    tracing::info!("intended state matches present state");
-                    None
-                }
-                (None, TimelineArchivalState::Archived) => Some(true),
-                (Some(_), TimelineArchivalState::Unarchived) => Some(false),
-            }
-        }
-        let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state);
-
-        if let Some(archived_at_set) = need_upload_scheduled {
-            let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc());
-            upload_queue.dirty.archived_at = intended_archived_at;
-            self.schedule_index_upload(upload_queue)?;
-        }
-
-        let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some();
-        Ok(need_wait)
-    }
-
    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///
@@ -736,12 +683,13 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Reparent this timeline to a new parent.
+    ///
+    /// A retryable step of timeline ancestor detach.
    pub(crate) async fn schedule_reparenting_and_wait(
        self: &Arc<Self>,
        new_parent: &TimelineId,
    ) -> anyhow::Result<()> {
-        // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
-        // and reads the in-memory part we cannot do the detaching like this
        let receiver = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
@@ -752,17 +700,29 @@ impl RemoteTimelineClient {
                ));
            };

-            upload_queue.dirty.metadata.reparent(new_parent);
-            upload_queue.dirty.lineage.record_previous_ancestor(&prev);
+            let uploaded = &upload_queue.clean.0.metadata;

-            self.schedule_index_upload(upload_queue)?;
+            if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() {
+                // nothing to do
+                None
+            } else {
+                let mut modified = false;

-            self.schedule_barrier0(upload_queue)
+                modified |= upload_queue.dirty.metadata.reparent(new_parent);
+                modified |= upload_queue.dirty.lineage.record_previous_ancestor(&prev);
+
+                if modified {
+                    self.schedule_index_upload(upload_queue)?;
+                }
+
+                Some(self.schedule_barrier0(upload_queue))
+            }
        };

-        Self::wait_completion0(receiver)
-            .await
-            .context("wait completion")
+        if let Some(receiver) = receiver {
+            Self::wait_completion0(receiver).await?;
+        }
+        Ok(())
    }

    /// Schedules uploading a new version of `index_part.json` with the given layers added,
@@ -778,26 +738,121 @@ impl RemoteTimelineClient {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
-            upload_queue.dirty.lineage.record_detaching(&adopted);
+            if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) {
+                None
+            } else {
+                let mut modified = false;
+                modified |= upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
+                modified |= upload_queue.dirty.lineage.record_detaching(&adopted);

-            for layer in layers {
-                upload_queue
-                    .dirty
-                    .layer_metadata
-                    .insert(layer.layer_desc().layer_name(), layer.metadata());
+                for layer in layers {
+                    let prev = upload_queue
+                        .dirty
+                        .layer_metadata
+                        .insert(layer.layer_desc().layer_name(), layer.metadata());
+                    modified |= prev.is_none();
+                }
+
+                if modified {
+                    self.schedule_index_upload(upload_queue)?;
+                }
+
+                Some(self.schedule_barrier0(upload_queue))
            }
-
-            self.schedule_index_upload(upload_queue)?;
-
-            let barrier = self.schedule_barrier0(upload_queue);
-            self.launch_queued_tasks(upload_queue);
-            barrier
        };

-        Self::wait_completion0(barrier)
-            .await
-            .context("wait completion")
+        if let Some(barrier) = barrier {
+            Self::wait_completion0(barrier).await?;
+        }
+        Ok(())
+    }
+
+    /// Marks timeline detach ancestor started for this timeline if it has not been marked as
+    /// started.
+    ///
+    /// A retryable step of timeline detach ancestor.
+    ///
+    /// Does not overwrite or even error if the set of reparentable timelines differs. Those can
+    /// be inspected later.
+    ///
+    /// Waits until the completion of the upload.
+    pub(crate) async fn schedule_started_detach_ancestor_mark_and_wait(
+        self: &Arc<Self>,
+    ) -> anyhow::Result<()> {
+        let maybe_barrier = {
+            let mut guard = self.upload_queue.lock().unwrap();
+            let upload_queue = guard.initialized_mut()?;
+
+            fn wanted(x: Option<&index::GcBlocking>) -> bool {
+                x.is_some_and(|b| b.blocked_by_detach_ancestor())
+            }
+
+            let current = upload_queue.dirty.gc_blocking.as_ref();
+            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
+
+            match (current, uploaded) {
+                (x, y) if wanted(x) && wanted(y) => None,
+                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
+                _ => {
+                    // at this point, the metadata must always show that there is a parent
+                    if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
+                        panic!("cannot start detach ancestor if there is nothing to detach from");
+                    }
+                    upload_queue.dirty.gc_blocking = current
+                        .map(|x| x.with_detach_ancestor())
+                        .or_else(|| Some(index::GcBlocking::started_now_for_detach_ancestor()));
+                    self.schedule_index_upload(upload_queue)?;
+                    Some(self.schedule_barrier0(upload_queue))
+                }
+            }
+        };
+
+        if let Some(barrier) = maybe_barrier {
+            Self::wait_completion0(barrier).await?;
+        }
+        Ok(())
+    }
+
+    /// Marks timeline detach ancestor completed for this timeline if it has not been marked as
+    /// such already.
+    ///
+    /// ## Panics
+    ///
+    /// If the timeline has not been detached from ancestor already.
+    pub(crate) async fn schedule_completed_detach_ancestor_mark_and_wait(
+        self: &Arc<Self>,
+    ) -> anyhow::Result<()> {
+        let maybe_barrier = {
+            let mut guard = self.upload_queue.lock().unwrap();
+            let upload_queue = guard.initialized_mut()?;
+
+            assert!(upload_queue.clean.0.lineage.is_detached_from_ancestor());
+
+            fn wanted(x: Option<&index::GcBlocking>) -> bool {
+                x.is_none() || x.is_some_and(|b| !b.blocked_by_detach_ancestor())
+            }
+
+            let current = upload_queue.dirty.gc_blocking.as_ref();
+            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
+
+            match (current, uploaded) {
+                (x, y) if wanted(x) && wanted(y) => None,
+                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
+                _ => {
+                    upload_queue.dirty.gc_blocking = current
+                        .expect("has to be Some because of wanted()")
+                        .without_detach_ancestor();
+                    assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
+                    self.schedule_index_upload(upload_queue)?;
+                    Some(self.schedule_barrier0(upload_queue))
+                }
+            }
+        };
+
+        if let Some(barrier) = maybe_barrier {
+            Self::wait_completion0(barrier).await?;
+        }
+        Ok(())
    }

    /// Launch an upload operation in the background; the file is added to be included in next
@@ -1378,18 +1433,6 @@ impl RemoteTimelineClient {
                .dirty
                .layer_metadata
                .drain()
-                .filter(|(_file_name, meta)| {
-                    // Filter out layers that belonged to an ancestor shard.  Since we are deleting the whole timeline from
-                    // all shards anyway, we _could_ delete these, but
-                    // - it creates a potential race if other shards are still
-                    //   using the layers while this shard deletes them.
-                    // - it means that if we rolled back the shard split, the ancestor shards would be in a state where
-                    //   these timelines are present but corrupt (their index exists but some layers don't)
-                    //
-                    // These layers will eventually be cleaned up by the scrubber when it does physical GC.
-                    meta.shard.shard_number == self.tenant_shard_id.shard_number
-                        && meta.shard.shard_count == self.tenant_shard_id.shard_count
-                })
                .map(|(file_name, meta)| {
                    remote_layer_path(
                        &self.tenant_shard_id.tenant_id,
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -32,10 +32,6 @@ pub struct IndexPart {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deleted_at: Option<NaiveDateTime>,

-    #[serde(default)]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub archived_at: Option<NaiveDateTime>,
-
    /// Per layer file name metadata, which can be present for a present or missing layer file.
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -60,6 +56,9 @@ pub struct IndexPart {
    #[serde(default)]
    pub(crate) lineage: Lineage,

+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub(crate) gc_blocking: Option<GcBlocking>,
+
    /// Describes the kind of aux files stored in the timeline.
    ///
    /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
@@ -84,7 +83,7 @@ impl IndexPart {
    /// - 5: lineage was added
    /// - 6: last_aux_file_policy is added.
    /// - 7: metadata_bytes is no longer written, but still read
-    /// - 8: added `archived_at`
+    /// - 8: +gc_blocking
    const LATEST_VERSION: usize = 8;

    // Versions we may see when reading from a bucket.
@@ -99,8 +98,8 @@ impl IndexPart {
            disk_consistent_lsn: metadata.disk_consistent_lsn(),
            metadata,
            deleted_at: None,
-            archived_at: None,
            lineage: Default::default(),
+            gc_blocking: None,
            last_aux_file_policy: None,
        }
    }
@@ -211,26 +210,45 @@ fn is_false(b: &bool) -> bool {
 impl Lineage {
    const REMEMBER_AT_MOST: usize = 100;

-    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
+    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool {
        if self.reparenting_history.last() == Some(old_ancestor) {
            // do not re-record it
-            return;
-        }
+            false
+        } else {
+            #[cfg(feature = "testing")]
+            {
+                let existing = self
+                    .reparenting_history
+                    .iter()
+                    .position(|x| x == old_ancestor);
+                assert_eq!(
+                    existing, None,
+                    "we cannot reparent onto and off and onto the same timeline twice"
+                );
+            }
+            let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;

-        let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
-
-        self.reparenting_history_truncated |= drop_oldest;
-        if drop_oldest {
-            self.reparenting_history.remove(0);
+            self.reparenting_history_truncated |= drop_oldest;
+            if drop_oldest {
+                self.reparenting_history.remove(0);
+            }
+            self.reparenting_history.push(*old_ancestor);
+            true
        }
-        self.reparenting_history.push(*old_ancestor);
    }

-    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
-        assert!(self.original_ancestor.is_none());
-
-        self.original_ancestor =
-            Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
+    /// Returns true if anything changed.
+    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
+        if let Some((id, lsn, _)) = self.original_ancestor {
+            assert_eq!(id, branchpoint.0);
+            assert_eq!(lsn, branchpoint.1);
+            false
+        } else {
+            assert!(self.original_ancestor.is_none());
+            self.original_ancestor =
+                Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
+            true
+        }
    }

    /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
@@ -242,15 +260,53 @@ impl Lineage {
            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
    }

-    pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
+    /// Returns true if the timeline originally had an ancestor, and no longer has one.
+    pub(crate) fn is_detached_from_ancestor(&self) -> bool {
        self.original_ancestor.is_some()
    }

+    /// Returns original ancestor timeline id and lsn that this timeline has been detached from.
+    pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> {
+        self.original_ancestor.map(|(id, lsn, _)| (id, lsn))
+    }
+
    pub(crate) fn is_reparented(&self) -> bool {
        !self.reparenting_history.is_empty()
    }
 }

+/// Right now, the only reason to block gc persistently is detach_ancestor. To use gc blocking more
+/// broadly, a reason-set field needs to be added, and building the shared state at load time needs
+/// to become more complicated to avoid detach_ancestor clearing out a manually configured gc blocking.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub(crate) struct GcBlocking {
+    pub(crate) started_at: NaiveDateTime,
+}
+
+impl GcBlocking {
+    pub(super) fn started_now_for_detach_ancestor() -> Self {
+        GcBlocking {
+            started_at: chrono::Utc::now().naive_utc(),
+        }
+    }
+
+    /// Returns true if detach_ancestor is one of the reasons why the gc is blocked.
+    pub(crate) fn blocked_by_detach_ancestor(&self) -> bool {
+        true
+    }
+
+    /// Returns a version of self with the reason of detach_ancestor.
+    pub(super) fn with_detach_ancestor(&self) -> Self {
+        self.clone()
+    }
+
+    /// Returns a version of self without the reason of detach_ancestor. Assumption is that if
+    /// there are no more reasons, we can unblock the gc.
+    pub(super) fn without_detach_ancestor(&self) -> Option<Self> {
+        None
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -290,8 +346,8 @@ mod tests {
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
-            archived_at: None,
            lineage: Lineage::default(),
+            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -333,8 +389,8 @@ mod tests {
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
-            archived_at: None,
            lineage: Lineage::default(),
+            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -377,8 +433,8 @@ mod tests {
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            archived_at: None,
            lineage: Lineage::default(),
+            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -424,8 +480,8 @@ mod tests {
            ])
            .unwrap(),
            deleted_at: None,
-            archived_at: None,
            lineage: Lineage::default(),
+            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -466,8 +522,8 @@ mod tests {
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            archived_at: None,
            lineage: Lineage::default(),
+            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -507,12 +563,12 @@ mod tests {
            disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
            metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
-            archived_at: None,
            lineage: Lineage {
                reparenting_history_truncated: false,
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
            },
+            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -557,12 +613,12 @@ mod tests {
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            archived_at: None,
            lineage: Lineage {
                reparenting_history_truncated: false,
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
            },
+            gc_blocking: None,
            last_aux_file_policy: Some(AuxFilePolicy::V2),
        };

@@ -616,8 +672,8 @@ mod tests {
                14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            archived_at: None,
            lineage: Default::default(),
+            gc_blocking: None,
            last_aux_file_policy: Default::default(),
        };

@@ -643,8 +699,9 @@ mod tests {
                "initdb_lsn": "0/1696070",
                "pg_version": 14
            },
-            "deleted_at": "2023-07-31T09:00:00.123",
-            "archived_at": "2023-04-29T09:00:00.123"
+            "gc_blocking": {
+                "started_at": "2024-07-19T09:00:00.123"
+            }
        }"#;

        let expected = IndexPart {
@@ -671,9 +728,11 @@ mod tests {
                Lsn::from_str("0/1696070").unwrap(),
                14,
            ).with_recalculated_checksum().unwrap(),
-            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
+            deleted_at: None,
            lineage: Default::default(),
+            gc_blocking: Some(GcBlocking {
+                started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
+            }),
            last_aux_file_policy: Default::default(),
        };

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,9 +8,6 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;

-#[cfg(test)]
-pub mod split_writer;
-
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
 use crate::walrecord::NeonWalRecord;
@@ -454,14 +451,20 @@ pub enum ValueReconstructResult {
 /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
 /// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
 /// be used for cache management but not for correctness-critical checks.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum LayerVisibilityHint {
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
+pub(crate) enum LayerVisibilityHint {
    /// A Visible layer might be read while serving a read, because there is not an image layer between it
    /// and a readable LSN (the tip of the branch or a child's branch point)
    Visible,
    /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
    /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
+    #[allow(unused)]
    Covered,
+    /// Calculating layer visibility requires I/O, so until this has happened layers are loaded
+    /// in this state.  Note that newly written layers may be called Visible immediately; this uninitialized
+    /// state is for when existing layers are constructed while loading a timeline.
+    #[default]
+    Uninitialized,
 }

 pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
@@ -623,29 +626,22 @@ impl LayerAccessStats {
        }
    }

-    /// Helper for extracting the visibility hint from the literal value of our inner u64
-    fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
-        match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
-            1 => LayerVisibilityHint::Visible,
-            0 => LayerVisibilityHint::Covered,
-            _ => unreachable!(),
-        }
-    }
-
-    /// Returns the old value which has been replaced
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
        let value = match visibility {
            LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
-            LayerVisibilityHint::Covered => 0x0,
+            LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
        };

-        let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
-        self.decode_visibility(old_bits)
+        self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
    }

    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
-        self.decode_visibility(read)
+        match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
+            1 => LayerVisibilityHint::Visible,
+            0 => LayerVisibilityHint::Covered,
+            _ => unreachable!(),
+        }
    }
 }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -307,10 +307,12 @@ impl DeltaLayer {
            .with_context(|| format!("Failed to load delta layer {}", self.path()))
    }

-    async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result<Arc<DeltaLayerInner>> {
+    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?;
+        let loaded = DeltaLayerInner::load(&path, None, None, ctx)
+            .await
+            .and_then(|res| res)?;

        // not production code
        let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
@@ -467,7 +469,7 @@ impl DeltaLayerWriterInner {
            .write_blob_maybe_compressed(val, ctx, compression)
            .await;
        let off = match res {
-            Ok((off, _)) => off,
+            Ok(off) => off,
            Err(e) => return (val, Err(anyhow::anyhow!(e))),
        };

@@ -758,24 +760,27 @@ impl DeltaLayerInner {
        &self.layer_lsn_range
    }

+    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
+    /// - inner has the success or transient failure
+    /// - outer has the permanent failure
    pub(super) async fn load(
        path: &Utf8Path,
        summary: Option<Summary>,
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
-            .await
-            .context("open layer file")?;
-
+    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
+        let file = match VirtualFile::open(path, ctx).await {
+            Ok(file) => file,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
+        };
        let file_id = page_cache::next_file_id();

        let block_reader = FileBlockReader::new(&file, file_id);

-        let summary_blk = block_reader
-            .read_blk(0, ctx)
-            .await
-            .context("read first block")?;
+        let summary_blk = match block_reader.read_blk(0, ctx).await {
+            Ok(blk) => blk,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
+        };

        // TODO: this should be an assertion instead; see ImageLayerInner::load
        let actual_summary =
@@ -797,7 +802,7 @@ impl DeltaLayerInner {
            }
        }

-        Ok(DeltaLayerInner {
+        Ok(Ok(DeltaLayerInner {
            file,
            file_id,
            index_start_blk: actual_summary.index_start_blk,
@@ -805,7 +810,7 @@ impl DeltaLayerInner {
            max_vectored_read_bytes,
            layer_key_range: actual_summary.key_range,
            layer_lsn_range: actual_summary.lsn_range,
-        })
+        }))
    }

    pub(super) async fn get_value_reconstruct_data(
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -265,8 +265,9 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let loaded =
-            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?;
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
+            .await
+            .and_then(|res| res)?;

        // not production code
        let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
@@ -384,16 +385,17 @@ impl ImageLayerInner {
        summary: Option<Summary>,
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
-            .await
-            .context("open layer file")?;
+    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
+        let file = match VirtualFile::open(path, ctx).await {
+            Ok(file) => file,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
+        };
        let file_id = page_cache::next_file_id();
        let block_reader = FileBlockReader::new(&file, file_id);
-        let summary_blk = block_reader
-            .read_blk(0, ctx)
-            .await
-            .context("read first block")?;
+        let summary_blk = match block_reader.read_blk(0, ctx).await {
+            Ok(blk) => blk,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
+        };

        // length is the only way how this could fail, so it's not actually likely at all unless
        // read_blk returns wrong sized block.
@@ -418,7 +420,7 @@ impl ImageLayerInner {
            }
        }

-        Ok(ImageLayerInner {
+        Ok(Ok(ImageLayerInner {
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
            lsn,
@@ -426,7 +428,7 @@ impl ImageLayerInner {
            file_id,
            max_vectored_read_bytes,
            key_range: actual_summary.key_range,
-        })
+        }))
    }

    pub(super) async fn get_value_reconstruct_data(
@@ -734,22 +736,8 @@ struct ImageLayerWriterInner {
    // Total uncompressed bytes passed into put_image
    uncompressed_bytes: u64,

-    // Like `uncompressed_bytes`,
-    // but only of images we might consider for compression
-    uncompressed_bytes_eligible: u64,
-
-    // Like `uncompressed_bytes`, but only of images
-    // where we have chosen their compressed form
-    uncompressed_bytes_chosen: u64,
-
-    // Number of keys in the layer.
-    num_keys: usize,
-
    blob_writer: BlobWriter<false>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
-
-    #[cfg_attr(not(feature = "testing"), allow(dead_code))]
-    last_written_key: Key,
 }

 impl ImageLayerWriterInner {
@@ -804,10 +792,6 @@ impl ImageLayerWriterInner {
            tree: tree_builder,
            blob_writer,
            uncompressed_bytes: 0,
-            uncompressed_bytes_eligible: 0,
-            uncompressed_bytes_chosen: 0,
-            num_keys: 0,
-            last_written_key: Key::MIN,
        };

        Ok(writer)
@@ -826,33 +810,18 @@ impl ImageLayerWriterInner {
    ) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
        let compression = self.conf.image_compression;
-        let uncompressed_len = img.len() as u64;
-        self.uncompressed_bytes += uncompressed_len;
-        self.num_keys += 1;
+        self.uncompressed_bytes += img.len() as u64;
        let (_img, res) = self
            .blob_writer
            .write_blob_maybe_compressed(img, ctx, compression)
            .await;
        // TODO: re-use the buffer for `img` further upstack
-        let (off, compression_info) = res?;
-        if compression_info.compressed_size.is_some() {
-            // The image has been considered for compression at least
-            self.uncompressed_bytes_eligible += uncompressed_len;
-        }
-        if compression_info.written_compressed {
-            // The image has been compressed
-            self.uncompressed_bytes_chosen += uncompressed_len;
-        }
+        let off = res?;

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
        self.tree.append(&keybuf, off)?;

-        #[cfg(feature = "testing")]
-        {
-            self.last_written_key = key;
-        }
-
        Ok(())
    }

@@ -863,7 +832,6 @@ impl ImageLayerWriterInner {
        self,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-        end_key: Option<Key>,
    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -871,9 +839,6 @@ impl ImageLayerWriterInner {
        // Calculate compression ratio
        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
-            .inc_by(self.uncompressed_bytes_eligible);
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
        crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);

        let mut file = self.blob_writer.into_inner();
@@ -914,23 +879,11 @@ impl ImageLayerWriterInner {
        let desc = PersistentLayerDesc::new_img(
            self.tenant_shard_id,
            self.timeline_id,
-            if let Some(end_key) = end_key {
-                self.key_range.start..end_key
-            } else {
-                self.key_range.clone()
-            },
+            self.key_range.clone(),
            self.lsn,
            metadata.len(),
        );

-        #[cfg(feature = "testing")]
-        if let Some(end_key) = end_key {
-            assert!(
-                self.last_written_key < end_key,
-                "written key violates end_key range"
-            );
-        }
-
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -1007,18 +960,6 @@ impl ImageLayerWriter {
        self.inner.as_mut().unwrap().put_image(key, img, ctx).await
    }

-    #[cfg(test)]
-    /// Estimated size of the image layer.
-    pub(crate) fn estimated_size(&self) -> u64 {
-        let inner = self.inner.as_ref().unwrap();
-        inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
-    }
-
-    #[cfg(test)]
-    pub(crate) fn num_keys(&self) -> usize {
-        self.inner.as_ref().unwrap().num_keys
-    }
-
    ///
    /// Finish writing the image layer.
    ///
@@ -1027,22 +968,7 @@ impl ImageLayerWriter {
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner.take().unwrap().finish(timeline, ctx, None).await
-    }
-
-    #[cfg(test)]
-    /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
-    pub(super) async fn finish_with_end_key(
-        mut self,
-        timeline: &Arc<Timeline>,
-        end_key: Key,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(timeline, ctx, Some(end_key))
-            .await
+        self.inner.take().unwrap().finish(timeline, ctx).await
    }
 }

--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -24,8 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
 use super::image_layer::{self};
 use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
-    ValuesReconstructState,
+    PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -247,7 +246,7 @@ impl Layer {
                &timeline.generation,
            );

-            LayerInner::new(
+            let layer = LayerInner::new(
                conf,
                timeline,
                local_path,
@@ -255,7 +254,14 @@ impl Layer {
                Some(inner),
                timeline.generation,
                timeline.get_shard_index(),
-            )
+            );
+
+            // Newly created layers are marked visible by default: the usual case is that they were created to be read.
+            layer
+                .access_stats
+                .set_visibility(super::LayerVisibilityHint::Visible);
+
+            layer
        }));

        let downloaded = resident.expect("just initialized");
@@ -487,32 +493,6 @@ impl Layer {
            }
        }
    }
-
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
-        let old_visibility = self.access_stats().set_visibility(visibility.clone());
-        use LayerVisibilityHint::*;
-        match (old_visibility, visibility) {
-            (Visible, Covered) => {
-                // Subtract this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .sub(self.0.desc.file_size)
-                }
-            }
-            (Covered, Visible) => {
-                // Add this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .add(self.0.desc.file_size)
-                }
-            }
-            (Covered, Covered) | (Visible, Visible) => {
-                // no change
-            }
-        }
-    }
 }

 /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
@@ -713,13 +693,6 @@ impl Drop for LayerInner {
                timeline.metrics.layer_count_image.dec();
                timeline.metrics.layer_size_image.sub(self.desc.file_size);
            }
-
-            if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
-                timeline
-                    .metrics
-                    .visible_physical_size_gauge
-                    .sub(self.desc.file_size);
-            }
        }

        if !*self.wanted_deleted.get_mut() {
@@ -828,12 +801,6 @@ impl LayerInner {
            timeline.metrics.layer_size_image.add(desc.file_size);
        }

-        // New layers are visible by default. This metric is later updated on drop or in set_visibility
-        timeline
-            .metrics
-            .visible_physical_size_gauge
-            .add(desc.file_size);
-
        LayerInner {
            conf,
            debug_str: {
@@ -1684,9 +1651,8 @@ impl Drop for DownloadedLayer {
 }

 impl DownloadedLayer {
-    /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`].
-    /// Failure to load the layer is sticky, i.e., future `get()` calls will return
-    /// the initial load failure immediately.
+    /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to
+    /// initialize it permanently.
    ///
    /// `owner` parameter is a strong reference at the same `LayerInner` as the
    /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
@@ -1717,7 +1683,7 @@ impl DownloadedLayer {
                    ctx,
                )
                .await
-                .map(LayerKind::Delta)
+                .map(|res| res.map(LayerKind::Delta))
            } else {
                let lsn = owner.desc.image_layer_lsn();
                let summary = Some(image_layer::Summary::expected(
@@ -1734,29 +1700,32 @@ impl DownloadedLayer {
                    ctx,
                )
                .await
-                .map(LayerKind::Image)
+                .map(|res| res.map(LayerKind::Image))
            };

            match res {
-                Ok(layer) => Ok(layer),
-                Err(err) => {
+                Ok(Ok(layer)) => Ok(Ok(layer)),
+                Ok(Err(transient)) => Err(transient),
+                Err(permanent) => {
                    LAYER_IMPL_METRICS.inc_permanent_loading_failures();
-                    // We log this message once over the lifetime of `Self`
-                    // => Ok and good to log backtrace and path here.
-                    tracing::error!(
-                        "layer load failed, assuming permanent failure: {}: {err:?}",
-                        owner.path
-                    );
-                    Err(err)
+                    // TODO(#5815): we are not logging all errors, so temporarily log them **once**
+                    // here as well
+                    let permanent = permanent.context("load layer");
+                    tracing::error!("layer loading failed permanently: {permanent:#}");
+                    Ok(Err(permanent))
                }
            }
        };
        self.kind
-            .get_or_init(init)
-            .await
+            .get_or_try_init(init)
+            // return transient errors using `?`
+            .await?
            .as_ref()
-            // We already logged the full backtrace above, once. Don't repeat that here.
-            .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
+            .map_err(|e| {
+                // errors are not clonable, so all we can do is stringify them
+                // test_broken_timeline matches this string
+                anyhow::anyhow!("layer loading failed: {e:#}")
+            })
    }

    async fn get_value_reconstruct_data(
@@ -1791,11 +1760,7 @@ impl DownloadedLayer {
    ) -> Result<(), GetVectoredError> {
        use LayerKind::*;

-        match self
-            .get(owner, ctx)
-            .await
-            .map_err(GetVectoredError::Other)?
-        {
+        match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
            Delta(d) => {
                d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
                    .await
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -828,9 +828,9 @@ async fn eviction_cancellation_on_drop() {
 #[test]
 #[cfg(target_arch = "x86_64")]
 fn layer_size() {
-    assert_eq!(size_of::<LayerAccessStats>(), 8);
-    assert_eq!(size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(size_of::<LayerInner>(), 312);
+    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 8);
+    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
+    assert_eq!(std::mem::size_of::<LayerInner>(), 312);
    // it also has the utf8 path
 }

--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -41,20 +41,6 @@ pub struct PersistentLayerKey {
    pub is_delta: bool,
 }

-impl std::fmt::Display for PersistentLayerKey {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{}..{} {}..{} is_delta={}",
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta
-        )
-    }
-}
-
 impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -204,11 +204,9 @@ impl<'a> IteratorWrapper<'a> {
 /// A merge iterator over delta/image layer iterators. When duplicated records are
 /// found, the iterator will not perform any deduplication, and the caller should handle
 /// these situation. By saying duplicated records, there are many possibilities:
-///
 /// * Two same delta at the same LSN.
 /// * Two same image at the same LSN.
 /// * Delta/image at the same LSN where the image has already applied the delta.
-///
 /// The iterator will always put the image before the delta.
 pub struct MergeIterator<'a> {
    heap: BinaryHeap<IteratorWrapper<'a>>,
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -1,244 +0,0 @@
-use std::sync::Arc;
-
-use bytes::Bytes;
-use pageserver_api::key::{Key, KEY_SIZE};
-use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
-
-use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
-
-use super::{ImageLayerWriter, ResidentLayer};
-
-/// An image writer that takes images and produces multiple image layers. The interface does not
-/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
-/// to be cleaned up)
-#[must_use]
-pub struct SplitImageLayerWriter {
-    inner: ImageLayerWriter,
-    target_layer_size: u64,
-    generated_layers: Vec<ResidentLayer>,
-    conf: &'static PageServerConf,
-    timeline_id: TimelineId,
-    tenant_shard_id: TenantShardId,
-    lsn: Lsn,
-}
-
-impl SplitImageLayerWriter {
-    pub async fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_shard_id: TenantShardId,
-        start_key: Key,
-        lsn: Lsn,
-        target_layer_size: u64,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
-            target_layer_size,
-            inner: ImageLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                &(start_key..Key::MAX),
-                lsn,
-                ctx,
-            )
-            .await?,
-            generated_layers: Vec::new(),
-            conf,
-            timeline_id,
-            tenant_shard_id,
-            lsn,
-        })
-    }
-
-    pub async fn put_image(
-        &mut self,
-        key: Key,
-        img: Bytes,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // The current estimation is an upper bound of the space that the key/image could take
-        // because we did not consider compression in this estimation. The resulting image layer
-        // could be smaller than the target size.
-        let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
-        {
-            let next_image_writer = ImageLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_shard_id,
-                &(key..Key::MAX),
-                self.lsn,
-                ctx,
-            )
-            .await?;
-            let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
-            self.generated_layers.push(
-                prev_image_writer
-                    .finish_with_end_key(tline, key, ctx)
-                    .await?,
-            );
-        }
-        self.inner.put_image(key, img, ctx).await
-    }
-
-    pub(crate) async fn finish(
-        self,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-        end_key: Key,
-    ) -> anyhow::Result<Vec<ResidentLayer>> {
-        let Self {
-            mut generated_layers,
-            inner,
-            ..
-        } = self;
-        generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
-        Ok(generated_layers)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::{
-        tenant::{
-            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::AsLayerDesc,
-        },
-        DEFAULT_PG_VERSION,
-    };
-
-    use super::*;
-
-    fn get_key(id: u32) -> Key {
-        let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-        key.field6 = id;
-        key
-    }
-
-    fn get_img(id: u32) -> Bytes {
-        format!("{id:064}").into()
-    }
-
-    fn get_large_img() -> Bytes {
-        vec![0; 8192].into()
-    }
-
-    #[tokio::test]
-    async fn write_one_image() {
-        let harness = TenantHarness::create("split_writer_write_one_image")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        writer
-            .put_image(get_key(0), get_img(0), &tline, &ctx)
-            .await
-            .unwrap();
-        let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
-        assert_eq!(layers.len(), 1);
-    }
-
-    #[tokio::test]
-    async fn write_split() {
-        let harness = TenantHarness::create("split_writer_write_split")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-        const N: usize = 2000;
-        for i in 0..N {
-            let i = i as u32;
-            writer
-                .put_image(get_key(i), get_large_img(), &tline, &ctx)
-                .await
-                .unwrap();
-        }
-        let layers = writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), N / 512 + 1);
-        for idx in 0..layers.len() {
-            assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN);
-            assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX);
-            if idx > 0 {
-                assert_eq!(
-                    layers[idx - 1].layer_desc().key_range.end,
-                    layers[idx].layer_desc().key_range.start
-                );
-            }
-        }
-    }
-
-    #[tokio::test]
-    async fn write_large_img() {
-        let harness = TenantHarness::create("split_writer_write_large_img")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        writer
-            .put_image(get_key(0), get_img(0), &tline, &ctx)
-            .await
-            .unwrap();
-        writer
-            .put_image(get_key(1), get_large_img(), &tline, &ctx)
-            .await
-            .unwrap();
-        let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
-        assert_eq!(layers.len(), 2);
-    }
-}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -129,9 +129,11 @@ pub fn start_background_loops(
            let background_jobs_can_start = background_jobs_can_start.cloned();
            async move {
                let cancel = task_mgr::shutdown_token();
+                let can_start = completion::Barrier::maybe_wait(background_jobs_can_start);
+                let can_start = tenant.ongoing_timeline_detach.gc_sleeping_while(can_start);
                tokio::select! {
                    _ = cancel.cancelled() => { return Ok(()) },
-                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                    _ = can_start => {}
                };
                gc_loop(tenant, cancel)
                    .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
@@ -210,28 +212,24 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                Duration::from_secs(10)
            } else {
                // Run compaction
-                match tenant.compaction_iteration(&cancel, &ctx).await {
-                    Err(e) => {
-                        let wait_duration = backoff::exponential_backoff_duration_seconds(
-                            error_run_count + 1,
-                            1.0,
-                            MAX_BACKOFF_SECS,
-                        );
-                        error_run_count += 1;
-                        let wait_duration = Duration::from_secs_f64(wait_duration);
-                        log_compaction_error(
-                            &e,
-                            error_run_count,
-                            &wait_duration,
-                            cancel.is_cancelled(),
-                        );
-                        wait_duration
-                    }
-                    Ok(has_pending_task) => {
-                        error_run_count = 0;
-                        // schedule the next compaction immediately in case there is a pending compaction task
-                        if has_pending_task { Duration::from_secs(0) } else { period }
-                    }
+                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count + 1,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    let wait_duration = Duration::from_secs_f64(wait_duration);
+                    log_compaction_error(
+                        &e,
+                        error_run_count,
+                        &wait_duration,
+                        cancel.is_cancelled(),
+                    );
+                    wait_duration
+                } else {
+                    error_run_count = 0;
+                    period
                }
            };

@@ -365,14 +363,13 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            if first {
                first = false;

-                if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel)
-                    .await
-                    .is_err()
-                {
-                    break;
-                }
+                let delays = async {
+                    delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
+                    random_init_delay(period, &cancel).await?;
+                    Ok::<_, Cancelled>(())
+                };

-                if random_init_delay(period, &cancel).await.is_err() {
+                if tenant.ongoing_timeline_detach.gc_sleeping_while(delays).await.is_err() {
                    break;
                }
            }
@@ -408,8 +405,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                        let wait_duration = Duration::from_secs_f64(wait_duration);

                        error!(
-                        "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
-                    );
+                            "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
+                        );
                        wait_duration
                    }
                }
@@ -418,7 +415,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);

            // Sleep
-            if tokio::time::timeout(sleep_duration, cancel.cancelled())
+            let cancelled = cancel.cancelled();
+            let cancelled = tenant.ongoing_timeline_detach.gc_sleeping_while(cancelled);
+            if tokio::time::timeout(sleep_duration, cancelled)
                .await
                .is_ok()
            {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3,7 +3,6 @@ pub(crate) mod compaction;
 pub mod delete;
 pub(crate) mod detach_ancestor;
 mod eviction_task;
-pub(crate) mod handle;
 mod init;
 pub mod layer_manager;
 pub(crate) mod logical_size;
@@ -18,7 +17,6 @@ use camino::Utf8Path;
 use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
-use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::{
@@ -60,7 +58,7 @@ use std::{
    sync::atomic::AtomicU64,
 };
 use std::{
-    cmp::{max, min},
+    cmp::{max, min, Ordering},
    ops::ControlFlow,
 };
 use std::{
@@ -76,7 +74,6 @@ use crate::{
        metadata::TimelineMetadata,
        storage_layer::PersistentLayerDesc,
    },
-    walredo,
 };
 use crate::{
    context::{DownloadBehavior, RequestContext},
@@ -143,10 +140,7 @@ use self::walreceiver::{WalReceiver, WalReceiverConf};
 use super::{config::TenantConf, upload_queue::NotInitialized};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
-use super::{
-    remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
-    storage_layer::ReadableLayer,
-};
+use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
 use super::{
    secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
    GcError,
@@ -183,6 +177,25 @@ impl std::fmt::Display for ImageLayerCreationMode {
    }
 }

+/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) struct Hole {
+    key_range: Range<Key>,
+    coverage_size: usize,
+}
+
+impl Ord for Hole {
+    fn cmp(&self, other: &Self) -> Ordering {
+        other.coverage_size.cmp(&self.coverage_size) // inverse order
+    }
+}
+
+impl PartialOrd for Hole {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
 /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
 /// Can be removed after all refactors are done.
 fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
@@ -430,8 +443,6 @@ pub struct Timeline {
    pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,

    pub(crate) l0_flush_global_state: L0FlushGlobalState,
-
-    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
 }

 pub struct WalReceiverInfo {
@@ -537,6 +548,7 @@ impl GetVectoredError {
    }
 }

+#[derive(Debug)]
 pub struct MissingKeyError {
    key: Key,
    shard: ShardNumber,
@@ -547,12 +559,6 @@ pub struct MissingKeyError {
    backtrace: Option<std::backtrace::Backtrace>,
 }

-impl std::fmt::Debug for MissingKeyError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self)
-    }
-}
-
 impl std::fmt::Display for MissingKeyError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
@@ -1004,10 +1010,7 @@ impl Timeline {
            .for_get_kind(GetKind::Singular)
            .observe(elapsed.as_secs_f64());

-        if cfg!(feature = "testing")
-            && res.is_err()
-            && !matches!(res, Err(PageReconstructError::Cancelled))
-        {
+        if cfg!(feature = "testing") && res.is_err() {
            // it can only be walredo issue
            use std::fmt::Write;

@@ -1766,14 +1769,13 @@ impl Timeline {
        }
    }

-    /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
-    /// compaction tasks.
+    /// Outermost timeline compaction operation; downloads needed layers.
    pub(crate) async fn compact(
        self: &Arc<Self>,
        cancel: &CancellationToken,
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<(), CompactionError> {
        // most likely the cancellation token is from background task, but in tests it could be the
        // request task as well.

@@ -1793,8 +1795,8 @@ impl Timeline {
        // compaction task goes over it's period (20s) which is quite often in production.
        let (_guard, _permit) = tokio::select! {
            tuple = prepare => { tuple },
-            _ = self.cancel.cancelled() => return Ok(false),
-            _ = cancel.cancelled() => return Ok(false),
+            _ = self.cancel.cancelled() => return Ok(()),
+            _ = cancel.cancelled() => return Ok(()),
        };

        let last_record_lsn = self.get_last_record_lsn();
@@ -1802,14 +1804,11 @@ impl Timeline {
        // Last record Lsn could be zero in case the timeline was just created
        if !last_record_lsn.is_valid() {
            warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
-            return Ok(false);
+            return Ok(());
        }

        match self.get_compaction_algorithm_settings().kind {
-            CompactionAlgorithm::Tiered => {
-                self.compact_tiered(cancel, ctx).await?;
-                Ok(false)
-            }
+            CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
            CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
        }
    }
@@ -1926,9 +1925,6 @@ impl Timeline {
        tracing::debug!("Cancelling CancellationToken");
        self.cancel.cancel();

-        // Ensure Prevent new page service requests from starting.
-        self.handles.shutdown();
-
        // Transition the remote_client into a state where it's only useful for timeline deletion.
        // (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
        self.remote_client.stop();
@@ -2001,11 +1997,6 @@ impl Timeline {
        self.current_state() == TimelineState::Active
    }

-    #[allow(unused)]
-    pub(crate) fn is_archived(&self) -> Option<bool> {
-        self.remote_client.is_archived()
-    }
-
    pub(crate) fn is_stopping(&self) -> bool {
        self.current_state() == TimelineState::Stopping
    }
@@ -2454,8 +2445,6 @@ impl Timeline {
                extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),

                l0_flush_global_state: resources.l0_flush_global_state,
-
-                handles: Default::default(),
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2739,10 +2728,6 @@ impl Timeline {
        // Tenant::create_timeline will wait for these uploads to happen before returning, or
        // on retry.

-        // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
-        drop(guard); // drop write lock, update_layer_visibility will take a read lock.
-        self.update_layer_visibility().await;
-
        info!(
            "loaded layer map with {} layers at {}, total physical size: {}",
            num_layers, disk_consistent_lsn, total_physical_size
@@ -3436,6 +3421,7 @@ impl Timeline {
        }
    }

+    #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
    #[allow(clippy::doc_lazy_continuation)]
    /// Get the data needed to reconstruct all keys in the provided keyspace
    ///
@@ -3721,25 +3707,10 @@ impl Timeline {
        Ok(ancestor.clone())
    }

-    pub(crate) fn get_ancestor_timeline(&self) -> Option<Arc<Timeline>> {
-        self.ancestor_timeline.clone()
-    }
-
    pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
        &self.shard_identity
    }

-    #[inline(always)]
-    pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
-        ShardTimelineId {
-            shard_index: ShardIndex {
-                shard_number: self.shard_identity.number,
-                shard_count: self.shard_identity.count,
-            },
-            timeline_id: self.timeline_id,
-        }
-    }
-
    ///
    /// Get a handle to the latest layer for appending.
    ///
@@ -4092,21 +4063,6 @@ impl Timeline {
            // release lock on 'layers'
        };

-        // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
-        // This makes us refuse ingest until the new layers have been persisted to the remote.
-        self.remote_client
-            .wait_completion()
-            .await
-            .map_err(|e| match e {
-                WaitCompletionError::UploadQueueShutDownOrStopped
-                | WaitCompletionError::NotInitialized(
-                    NotInitialized::ShuttingDown | NotInitialized::Stopped,
-                ) => FlushLayerError::Cancelled,
-                WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => {
-                    FlushLayerError::Other(anyhow!(e).into())
-                }
-            })?;
-
        // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
        // a compaction can delete the file and then it won't be available for uploads any more.
        // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
@@ -4699,6 +4655,27 @@ impl Timeline {
            }
        }

+        // The writer.finish() above already did the fsync of the inodes.
+        // We just need to fsync the directory in which these inodes are linked,
+        // which we know to be the timeline directory.
+        if !image_layers.is_empty() {
+            // We use fatal_err() below because after writer.finish() returns with success,
+            // the in-memory state of the filesystem already has the layer file in its final place,
+            // and subsequent pageserver code could think it's durable while it really isn't.
+            let timeline_dir = VirtualFile::open(
+                &self
+                    .conf
+                    .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+                ctx,
+            )
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+        }
+
        let mut guard = self.layers.write().await;

        // FIXME: we could add the images to be uploaded *before* returning from here, but right
@@ -4707,9 +4684,6 @@ impl Timeline {
        drop_wlock(guard);
        timer.stop_and_record();

-        // Creating image layers may have caused some previously visible layers to be covered
-        self.update_layer_visibility().await;
-
        Ok(image_layers)
    }

@@ -4772,18 +4746,21 @@ impl Timeline {
        detach_ancestor::prepare(self, tenant, options, ctx).await
    }

-    /// Completes the ancestor detach. This method is to be called while holding the
-    /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any
-    /// timeline be deleted. After this method returns successfully, tenant must be reloaded.
+    /// Second step of detach from ancestor; detaches the `self` from its current ancestor and
+    /// reparents any reparentable children of previous ancestor.
    ///
-    /// Pageserver receiving a SIGKILL during this operation is not supported (yet).
-    pub(crate) async fn complete_detaching_timeline_ancestor(
+    /// This method is to be called while holding the TenantManager's tenant slot, so during this
+    /// method we cannot be deleted nor can any timeline be deleted. After this method returns
+    /// successfully, tenant must be reloaded.
+    ///
+    /// Final step will be to complete after optionally resetting the tenant.
+    pub(crate) async fn detach_from_ancestor_and_reparent(
        self: &Arc<Timeline>,
        tenant: &crate::tenant::Tenant,
        prepared: detach_ancestor::PreparedTimelineDetach,
        ctx: &RequestContext,
-    ) -> Result<Vec<TimelineId>, anyhow::Error> {
-        detach_ancestor::complete(self, tenant, prepared, ctx).await
+    ) -> Result<detach_ancestor::DetachingAndReparenting, anyhow::Error> {
+        detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await
    }

    /// Switch aux file policy and schedule upload to the index part.
@@ -5474,22 +5451,20 @@ impl Timeline {
                } else {
                    trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
                };
-                let res = self
+
+                let img = match self
                    .walredo_mgr
                    .as_ref()
                    .context("timeline has no walredo manager")
                    .map_err(PageReconstructError::WalRedo)?
                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
-                    .await;
-                let img = match res {
+                    .await
+                    .context("reconstruct a page image")
+                {
                    Ok(img) => img,
-                    Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
-                    Err(walredo::Error::Other(e)) => {
-                        return Err(PageReconstructError::WalRedo(
-                            e.context("reconstruct a page image"),
-                        ))
-                    }
+                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
                };
+
                Ok(img)
            }
        }
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -63,19 +63,10 @@ pub(super) async fn delete_local_timeline_directory(
    tenant_shard_id: TenantShardId,
    timeline: &Timeline,
 ) -> anyhow::Result<()> {
-    // Always ensure the lock order is compaction -> gc.
-    let compaction_lock = timeline.compaction_lock.lock();
-    let compaction_lock = crate::timed(
-        compaction_lock,
-        "acquires compaction lock",
-        std::time::Duration::from_secs(5),
-    )
-    .await;
-
-    let gc_lock = timeline.gc_lock.lock();
-    let gc_lock = crate::timed(
-        gc_lock,
-        "acquires gc lock",
+    let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
+    let guards = crate::timed(
+        guards,
+        "acquire gc and compaction locks",
        std::time::Duration::from_secs(5),
    )
    .await;
@@ -116,8 +107,7 @@ pub(super) async fn delete_local_timeline_directory(
        .context("fsync_pre_mark_remove")?;

    info!("finished deleting layer files, releasing locks");
-    drop(gc_lock);
-    drop(compaction_lock);
+    drop(guards);

    fail::fail_point!("timeline-delete-after-rm", |_| {
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
@@ -216,10 +206,11 @@ impl DeleteTimelineFlow {
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip_all)]
+    #[instrument(skip_all, fields(%inplace))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
+        inplace: bool,
    ) -> Result<(), DeleteTimelineError> {
        super::debug_assert_current_span_has_tenant_and_timeline_id();

@@ -230,6 +221,8 @@ impl DeleteTimelineFlow {
        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
        timeline.shutdown(super::ShutdownMode::Hard).await;

+        tenant.ongoing_timeline_detach.on_delete(&timeline);
+
        fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
            Err(anyhow::anyhow!(
                "failpoint: timeline-delete-before-index-deleted-at"
@@ -244,7 +237,11 @@ impl DeleteTimelineFlow {
            ))?
        });

-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        if inplace {
+            Self::background(guard, tenant.conf, tenant, &timeline).await?
+        } else {
+            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        }

        Ok(())
    }
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1,967 +0,0 @@
-//! An efficient way to keep the timeline gate open without preventing
-//! timeline shutdown for longer than a single call to a timeline method.
-//!
-//! # Motivation
-//!
-//! On a single page service connection, we're typically serving a single TenantTimelineId.
-//!
-//! Without sharding, there is a single Timeline object to which we dispatch
-//! all requests. For example, a getpage request gets dispatched to the
-//! Timeline::get method of the Timeline object that represents the
-//! (tenant,timeline) of that connection.
-//!
-//! With sharding, for each request that comes in on the connection,
-//! we first have to perform shard routing based on the requested key (=~ page number).
-//! The result of shard routing is a Timeline object.
-//! We then dispatch the request to that Timeline object.
-//!
-//! Regardless of whether the tenant is sharded or not, we want to ensure that
-//! we hold the Timeline gate open while we're invoking the method on the
-//! Timeline object.
-//!
-//! However, we want to avoid the overhead of entering the gate for every
-//! method invocation.
-//!
-//! Further, for shard routing, we want to avoid calling the tenant manager to
-//! resolve the shard for every request. Instead, we want to cache the
-//! routing result so we can bypass the tenant manager for all subsequent requests
-//! that get routed to that shard.
-//!
-//! Regardless of how we accomplish the above, it should not
-//! prevent the Timeline from shutting down promptly.
-//!
-//! # Design
-//!
-//! There are three user-facing data structures:
-//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
-//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
-//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
-//!   Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
-//!
-//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
-//!
-//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
-//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
-//!
-//! To dispatch a request, the page service connection calls `Cache::get`.
-//!
-//! A cache miss means we consult the tenant manager for shard routing,
-//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
-//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
-//! and the `Arc<HandleInner>` in the `PerTimelineState`.
-//!
-//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
-//! and find the `Weak<HandleInner>` in the cache.
-//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
-//!
-//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
-//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
-//!
-//! # Memory Management / How The Reference Cycle Is Broken
-//!
-//! The attentive reader may have noticed the strong reference cycle
-//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
-//!
-//! This cycle is intentional: while it exists, the `Cache` can upgrade its
-//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
-//!
-//! The cycle is broken by either
-//! - `PerTimelineState::shutdown` or
-//! - dropping the `Cache`.
-//!
-//! Concurrently existing `Handle`s will extend the existence of the cycle.
-//! However, since `Handle`s are short-lived and new `Handle`s are not
-//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
-//! that extension of the cycle is bounded.
-//!
-//! # Fast Path for Shard Routing
-//!
-//! The `Cache` has a fast path for shard routing to avoid calling into
-//! the tenant manager for every request.
-//!
-//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
-//!
-//! The current implementation uses the first entry in the hash map
-//! to determine the `ShardParameters` and derive the correct
-//! `ShardIndex` for the requested key.
-//!
-//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
-//!
-//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
-//! it's a hit.
-//!
-//! ## Cache invalidation
-//!
-//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
-//! The only reasons why an entry in the cache can become stale are:
-//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
-//!    being detached, timeline or shard deleted, or pageserver is shutting down.
-//! 2. We're doing a shard split and new traffic should be routed to the child shards.
-//!
-//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
-//! timeline has shut down, and when that happens, we remove the entry from the cache.
-//!
-//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
-//! to the parent shard during a shard split. Eventually, the shard split task will
-//! shut down the parent => case (1).
-
-use std::collections::hash_map;
-use std::collections::HashMap;
-use std::sync::atomic::AtomicBool;
-use std::sync::atomic::Ordering;
-use std::sync::Arc;
-use std::sync::Mutex;
-use std::sync::Weak;
-
-use pageserver_api::shard::ShardIdentity;
-use tracing::instrument;
-use tracing::trace;
-use utils::id::TimelineId;
-use utils::shard::ShardIndex;
-use utils::shard::ShardNumber;
-
-use crate::tenant::mgr::ShardSelector;
-
-/// The requirement for Debug is so that #[derive(Debug)] works in some places.
-pub(crate) trait Types: Sized + std::fmt::Debug {
-    type TenantManagerError: Sized + std::fmt::Debug;
-    type TenantManager: TenantManager<Self> + Sized;
-    type Timeline: ArcTimeline<Self> + Sized;
-}
-
-/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
-/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
-/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
-#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
-struct CacheId(u64);
-
-impl CacheId {
-    fn next() -> Self {
-        static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
-        let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        if id == 0 {
-            panic!("CacheId::new() returned 0, overflow");
-        }
-        Self(id)
-    }
-}
-
-/// See module-level comment.
-pub(crate) struct Cache<T: Types> {
-    id: CacheId,
-    map: Map<T>,
-}
-
-type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
-
-impl<T: Types> Default for Cache<T> {
-    fn default() -> Self {
-        Self {
-            id: CacheId::next(),
-            map: Default::default(),
-        }
-    }
-}
-
-#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
-pub(crate) struct ShardTimelineId {
-    pub(crate) shard_index: ShardIndex,
-    pub(crate) timeline_id: TimelineId,
-}
-
-/// See module-level comment.
-pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
-struct HandleInner<T: Types> {
-    shut_down: AtomicBool,
-    timeline: T::Timeline,
-    // The timeline's gate held open.
-    _gate_guard: utils::sync::gate::GateGuard,
-}
-
-/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
-///
-/// See module-level comment for details.
-pub struct PerTimelineState<T: Types> {
-    // None = shutting down
-    handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
-}
-
-impl<T: Types> Default for PerTimelineState<T> {
-    fn default() -> Self {
-        Self {
-            handles: Mutex::new(Some(Default::default())),
-        }
-    }
-}
-
-/// Abstract view of [`crate::tenant::mgr`], for testability.
-pub(crate) trait TenantManager<T: Types> {
-    /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
-    /// Errors are returned as [`GetError::TenantManager`].
-    async fn resolve(
-        &self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-    ) -> Result<T::Timeline, T::TenantManagerError>;
-}
-
-/// Abstract view of an [`Arc<Timeline>`], for testability.
-pub(crate) trait ArcTimeline<T: Types>: Clone {
-    fn gate(&self) -> &utils::sync::gate::Gate;
-    fn shard_timeline_id(&self) -> ShardTimelineId;
-    fn get_shard_identity(&self) -> &ShardIdentity;
-    fn per_timeline_state(&self) -> &PerTimelineState<T>;
-}
-
-/// Errors returned by [`Cache::get`].
-#[derive(Debug)]
-pub(crate) enum GetError<T: Types> {
-    TenantManager(T::TenantManagerError),
-    TimelineGateClosed,
-    PerTimelineStateShutDown,
-}
-
-/// Internal type used in [`Cache::get`].
-enum RoutingResult<T: Types> {
-    FastPath(Handle<T>),
-    SlowPath(ShardTimelineId),
-    NeedConsultTenantManager,
-}
-
-impl<T: Types> Cache<T> {
-    /// See module-level comment for details.
-    ///
-    /// Does NOT check for the shutdown state of [`Types::Timeline`].
-    /// Instead, the methods of [`Types::Timeline`] that are invoked through
-    /// the [`Handle`] are responsible for checking these conditions
-    /// and if so, return an error that causes the page service to
-    /// close the connection.
-    #[instrument(level = "trace", skip_all)]
-    pub(crate) async fn get(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        // terminates because each iteration removes an element from the map
-        loop {
-            let handle = self
-                .get_impl(timeline_id, shard_selector, tenant_manager)
-                .await?;
-            if handle.0.shut_down.load(Ordering::Relaxed) {
-                let removed = self
-                    .map
-                    .remove(&handle.0.timeline.shard_timeline_id())
-                    .expect("invariant of get_impl is that the returned handle is in the map");
-                assert!(
-                    Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
-                    "shard_timeline_id() incorrect?"
-                );
-            } else {
-                return Ok(handle);
-            }
-        }
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    async fn get_impl(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        let miss: ShardSelector = {
-            let routing_state = self.shard_routing(timeline_id, shard_selector);
-            match routing_state {
-                RoutingResult::FastPath(handle) => return Ok(handle),
-                RoutingResult::SlowPath(key) => match self.map.get(&key) {
-                    Some(cached) => match cached.upgrade() {
-                        Some(upgraded) => return Ok(Handle(upgraded)),
-                        None => {
-                            trace!("handle cache stale");
-                            self.map.remove(&key).unwrap();
-                            ShardSelector::Known(key.shard_index)
-                        }
-                    },
-                    None => ShardSelector::Known(key.shard_index),
-                },
-                RoutingResult::NeedConsultTenantManager => shard_selector,
-            }
-        };
-        self.get_miss(timeline_id, miss, tenant_manager).await
-    }
-
-    #[inline(always)]
-    fn shard_routing(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-    ) -> RoutingResult<T> {
-        loop {
-            // terminates because when every iteration we remove an element from the map
-            let Some((first_key, first_handle)) = self.map.iter().next() else {
-                return RoutingResult::NeedConsultTenantManager;
-            };
-            let Some(first_handle) = first_handle.upgrade() else {
-                // TODO: dedup with get()
-                trace!("handle cache stale");
-                let first_key_owned = *first_key;
-                self.map.remove(&first_key_owned).unwrap();
-                continue;
-            };
-
-            let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
-            let make_shard_index = |shard_num: ShardNumber| ShardIndex {
-                shard_number: shard_num,
-                shard_count: first_handle_shard_identity.count,
-            };
-
-            let need_idx = match shard_selector {
-                ShardSelector::Page(key) => {
-                    make_shard_index(first_handle_shard_identity.get_shard_number(&key))
-                }
-                ShardSelector::Zero => make_shard_index(ShardNumber(0)),
-                ShardSelector::Known(shard_idx) => shard_idx,
-            };
-            let need_shard_timeline_id = ShardTimelineId {
-                shard_index: need_idx,
-                timeline_id,
-            };
-            let first_handle_shard_timeline_id = ShardTimelineId {
-                shard_index: first_handle_shard_identity.shard_index(),
-                timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
-            };
-
-            if need_shard_timeline_id == first_handle_shard_timeline_id {
-                return RoutingResult::FastPath(Handle(first_handle));
-            } else {
-                return RoutingResult::SlowPath(need_shard_timeline_id);
-            }
-        }
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    #[inline(always)]
-    async fn get_miss(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        match tenant_manager.resolve(timeline_id, shard_selector).await {
-            Ok(timeline) => {
-                let key = timeline.shard_timeline_id();
-                match &shard_selector {
-                    ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
-                    ShardSelector::Page(_) => (), // gotta trust tenant_manager
-                    ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
-                }
-
-                let gate_guard = match timeline.gate().enter() {
-                    Ok(guard) => guard,
-                    Err(_) => {
-                        return Err(GetError::TimelineGateClosed);
-                    }
-                };
-                trace!("creating new HandleInner");
-                let handle = Arc::new(
-                    // TODO: global metric that keeps track of the number of live HandlerTimeline instances
-                    // so we can identify reference cycle bugs.
-                    HandleInner {
-                        shut_down: AtomicBool::new(false),
-                        _gate_guard: gate_guard,
-                        timeline: timeline.clone(),
-                    },
-                );
-                let handle = {
-                    let mut lock_guard = timeline
-                        .per_timeline_state()
-                        .handles
-                        .lock()
-                        .expect("mutex poisoned");
-                    match &mut *lock_guard {
-                        Some(per_timeline_state) => {
-                            let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
-                            assert!(replaced.is_none(), "some earlier code left a stale handle");
-                            match self.map.entry(key) {
-                                hash_map::Entry::Occupied(_o) => {
-                                    // This cannot not happen because
-                                    // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
-                                    // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
-                                    //    while we were waiting for the tenant manager.
-                                    unreachable!()
-                                }
-                                hash_map::Entry::Vacant(v) => {
-                                    v.insert(Arc::downgrade(&handle));
-                                    handle
-                                }
-                            }
-                        }
-                        None => {
-                            return Err(GetError::PerTimelineStateShutDown);
-                        }
-                    }
-                };
-                Ok(Handle(handle))
-            }
-            Err(e) => Err(GetError::TenantManager(e)),
-        }
-    }
-}
-
-impl<T: Types> PerTimelineState<T> {
-    /// After this method returns, [`Cache::get`] will never again return a [`Handle`]
-    /// to the [`Types::Timeline`] that embeds this per-timeline state.
-    /// Even if [`TenantManager::resolve`] would still resolve to it.
-    ///
-    /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
-    /// That's ok because they're short-lived. See module-level comment for details.
-    #[instrument(level = "trace", skip_all)]
-    pub(super) fn shutdown(&self) {
-        let handles = self
-            .handles
-            .lock()
-            .expect("mutex poisoned")
-            // NB: this .take() sets locked to None.
-            // That's what makes future `Cache::get` misses fail.
-            // Cache hits are taken care of below.
-            .take();
-        let Some(handles) = handles else {
-            trace!("already shut down");
-            return;
-        };
-        for handle in handles.values() {
-            // Make hits fail.
-            handle.shut_down.store(true, Ordering::Relaxed);
-        }
-        drop(handles);
-    }
-}
-
-impl<T: Types> std::ops::Deref for Handle<T> {
-    type Target = T::Timeline;
-    fn deref(&self) -> &Self::Target {
-        &self.0.timeline
-    }
-}
-
-#[cfg(test)]
-impl<T: Types> Drop for HandleInner<T> {
-    fn drop(&mut self) {
-        trace!("HandleInner dropped");
-    }
-}
-
-// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
-impl<T: Types> Drop for Cache<T> {
-    fn drop(&mut self) {
-        for (_, weak) in self.map.drain() {
-            if let Some(strong) = weak.upgrade() {
-                // handle is still being kept alive in PerTimelineState
-                let timeline = strong.timeline.per_timeline_state();
-                let mut handles = timeline.handles.lock().expect("mutex poisoned");
-                if let Some(handles) = &mut *handles {
-                    let Some(removed) = handles.remove(&self.id) else {
-                        // There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
-                        continue;
-                    };
-                    assert!(Arc::ptr_eq(&removed, &strong));
-                }
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use pageserver_api::{
-        key::{rel_block_to_key, Key, DBDIR_KEY},
-        models::ShardParameters,
-        reltag::RelTag,
-        shard::ShardStripeSize,
-    };
-    use utils::shard::ShardCount;
-
-    use super::*;
-
-    const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
-
-    #[derive(Debug)]
-    struct TestTypes;
-    impl Types for TestTypes {
-        type TenantManagerError = anyhow::Error;
-        type TenantManager = StubManager;
-        type Timeline = Arc<StubTimeline>;
-    }
-
-    struct StubManager {
-        shards: Vec<Arc<StubTimeline>>,
-    }
-
-    struct StubTimeline {
-        gate: utils::sync::gate::Gate,
-        id: TimelineId,
-        shard: ShardIdentity,
-        per_timeline_state: PerTimelineState<TestTypes>,
-        myself: Weak<StubTimeline>,
-    }
-
-    impl StubTimeline {
-        fn getpage(&self) {
-            // do nothing
-        }
-    }
-
-    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
-        fn gate(&self) -> &utils::sync::gate::Gate {
-            &self.gate
-        }
-
-        fn shard_timeline_id(&self) -> ShardTimelineId {
-            ShardTimelineId {
-                shard_index: self.shard.shard_index(),
-                timeline_id: self.id,
-            }
-        }
-
-        fn get_shard_identity(&self) -> &ShardIdentity {
-            &self.shard
-        }
-
-        fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
-            &self.per_timeline_state
-        }
-    }
-
-    impl TenantManager<TestTypes> for StubManager {
-        async fn resolve(
-            &self,
-            timeline_id: TimelineId,
-            shard_selector: ShardSelector,
-        ) -> anyhow::Result<Arc<StubTimeline>> {
-            for timeline in &self.shards {
-                if timeline.id == timeline_id {
-                    match &shard_selector {
-                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Zero => continue,
-                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Page(_) => continue,
-                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Known(_) => continue,
-                    }
-                }
-            }
-            anyhow::bail!("not found")
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_timeline_shutdown() {
-        crate::tenant::harness::setup_logging();
-
-        let timeline_id = TimelineId::generate();
-        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mgr = StubManager {
-            shards: vec![shard0.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        //
-        // fill the cache
-        //
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (2, 1),
-            "strong: shard0, mgr; weak: myself"
-        );
-
-        let handle: Handle<_> = cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have the timeline");
-        let handle_inner_weak = Arc::downgrade(&handle.0);
-        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-        assert_eq!(
-            (
-                Weak::strong_count(&handle_inner_weak),
-                Weak::weak_count(&handle_inner_weak)
-            ),
-            (2, 2),
-            "strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
-        );
-        assert_eq!(cache.map.len(), 1);
-
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
-        );
-        drop(handle);
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
-        );
-
-        //
-        // demonstrate that Handle holds up gate closure
-        // but shutdown prevents new handles from being handed out
-        //
-
-        tokio::select! {
-            _ = shard0.gate.close() => {
-                panic!("cache and per-timeline handler state keep cache open");
-            }
-            _ = tokio::time::sleep(FOREVER) => {
-                // NB: first poll of close() makes it enter closing state
-            }
-        }
-
-        let handle = cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have the timeline");
-        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-
-        // SHUTDOWN
-        shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
-
-        assert_eq!(
-            1,
-            Weak::strong_count(&handle_inner_weak),
-            "through local var handle"
-        );
-        assert_eq!(
-            cache.map.len(),
-            1,
-            "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
-        );
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(via handle), shard0, mgr; weak: myself"
-        );
-
-        // this handle is perfectly usable
-        handle.getpage();
-
-        cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
-        assert_eq!(
-            cache.map.len(),
-            0,
-            "first access after shutdown cleans up the Weak's from the cache"
-        );
-
-        tokio::select! {
-            _ = shard0.gate.close() => {
-                panic!("handle is keeping gate open");
-            }
-            _ = tokio::time::sleep(FOREVER) => { }
-        }
-
-        drop(handle);
-        assert_eq!(
-            0,
-            Weak::strong_count(&handle_inner_weak),
-            "the HandleInner destructor already ran"
-        );
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (2, 1),
-            "strong: shard0, mgr; weak: myself"
-        );
-
-        // closing gate succeeds after dropping handle
-        tokio::select! {
-            _ = shard0.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("handle is dropped, no other gate holders exist")
-            }
-        }
-
-        // map gets cleaned on next lookup
-        cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown");
-        assert_eq!(cache.map.len(), 0);
-
-        // ensure all refs to shard0 are gone and we're not leaking anything
-        let myself = Weak::clone(&shard0.myself);
-        drop(shard0);
-        drop(mgr);
-        assert_eq!(Weak::strong_count(&myself), 0);
-    }
-
-    #[tokio::test]
-    async fn test_multiple_timelines_and_deletion() {
-        crate::tenant::harness::setup_logging();
-
-        let timeline_a = TimelineId::generate();
-        let timeline_b = TimelineId::generate();
-        assert_ne!(timeline_a, timeline_b);
-        let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_a,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_b,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mut mgr = StubManager {
-            shards: vec![timeline_a.clone(), timeline_b.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        cache
-            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have it");
-        cache
-            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have it");
-        assert_eq!(cache.map.len(), 2);
-
-        // delete timeline A
-        timeline_a.per_timeline_state.shutdown();
-        mgr.shards.retain(|t| t.id != timeline_a.id);
-        assert!(
-            mgr.resolve(timeline_a.id, ShardSelector::Page(key))
-                .await
-                .is_err(),
-            "broken StubManager implementation"
-        );
-
-        assert_eq!(
-            cache.map.len(),
-            2,
-            "cache still has a Weak handle to Timeline A"
-        );
-        cache
-            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown");
-        assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
-
-        cache
-            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we still have it");
-    }
-
-    fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
-        rel_block_to_key(
-            RelTag {
-                spcnode: 1663,
-                dbnode: 208101,
-                relnode: 2620,
-                forknum: 0,
-            },
-            shard.0 as u32 * params.stripe_size.0,
-        )
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_shard_split() {
-        crate::tenant::harness::setup_logging();
-        let timeline_id = TimelineId::generate();
-        let parent = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child_params = ShardParameters {
-            count: ShardCount(2),
-            stripe_size: ShardStripeSize::default(),
-        };
-        let child0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child1 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child_shards_by_shard_number = [child0.clone(), child1.clone()];
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        // fill the cache with the parent
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![parent.clone()],
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(&handle.myself, &parent.myself),
-                "mgr returns parent first"
-            );
-            drop(handle);
-        }
-
-        //
-        // SHARD SPLIT: tenant manager changes, but the cache isn't informed
-        //
-
-        // while we haven't shut down the parent, the cache will return the cached parent, even
-        // if the tenant manager returns the child
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(&handle.myself, &parent.myself),
-                "mgr returns parent"
-            );
-            drop(handle);
-        }
-
-        let parent_handle = cache
-            .get(
-                timeline_id,
-                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
-                &StubManager {
-                    shards: vec![parent.clone()],
-                },
-            )
-            .await
-            .expect("we have it");
-        assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
-
-        // invalidate the cache
-        parent.per_timeline_state.shutdown();
-
-        // the cache will now return the child, even though the parent handle still exists
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(
-                    &handle.myself,
-                    &child_shards_by_shard_number[i as usize].myself
-                ),
-                "mgr returns child"
-            );
-            drop(handle);
-        }
-
-        // all the while the parent handle kept the parent gate open
-        tokio::select! {
-            _ = parent_handle.gate.close() => {
-                panic!("parent handle is keeping gate open");
-            }
-            _ = tokio::time::sleep(FOREVER) => { }
-        }
-        drop(parent_handle);
-        tokio::select! {
-            _ = parent.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("parent handle is dropped, no other gate holders exist")
-            }
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_connection_handler_exit() {
-        crate::tenant::harness::setup_logging();
-        let timeline_id = TimelineId::generate();
-        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mgr = StubManager {
-            shards: vec![shard0.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        // Simulate 10 connections that's opened, used, and closed
-        let mut used_handles = vec![];
-        for _ in 0..10 {
-            let mut cache = Cache::<TestTypes>::default();
-            let handle = {
-                let handle = cache
-                    .get(timeline_id, ShardSelector::Page(key), &mgr)
-                    .await
-                    .expect("we have the timeline");
-                assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-                handle
-            };
-            handle.getpage();
-            used_handles.push(Arc::downgrade(&handle.0));
-        }
-
-        // No handles exist, thus gates are closed and don't require shutdown
-        assert!(used_handles
-            .iter()
-            .all(|weak| Weak::strong_count(weak) == 0));
-
-        // ... thus the gate should close immediately, even without shutdown
-        tokio::select! {
-            _ = shard0.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("handle is dropped, no other gate holders exist")
-            }
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -35,10 +35,6 @@ impl LayerManager {
        self.layer_fmgr.get_from_desc(desc)
    }

-    pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
-        self.layer_fmgr.get_from_key(desc)
-    }
-
    /// Get an immutable reference to the layer map.
    ///
    /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
@@ -369,20 +365,16 @@ impl<T> Default for LayerFileManager<T> {
 }

 impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
-    fn get_from_key(&self, key: &PersistentLayerKey) -> T {
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
        self.0
-            .get(key)
-            .with_context(|| format!("get layer from key: {}", key))
+            .get(&desc.key())
+            .with_context(|| format!("get layer from desc: {}", desc.layer_name()))
            .expect("not found")
            .clone()
    }

-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
-        self.get_from_key(&desc.key())
-    }
-
    fn contains_key(&self, key: &PersistentLayerKey) -> bool {
        self.0.contains_key(key)
    }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -618,7 +618,7 @@ impl WalIngest {
                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
                                0
                            } else {
-                                size_of::<u16>() * xlrec.ntuples as usize
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
                            };
                        assert_eq!(offset_array_len, buf.remaining());

@@ -685,7 +685,7 @@ impl WalIngest {
                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
                                0
                            } else {
-                                size_of::<u16>() * xlrec.ntuples as usize
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
                            };
                        assert_eq!(offset_array_len, buf.remaining());

@@ -752,7 +752,7 @@ impl WalIngest {
                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
                                0
                            } else {
-                                size_of::<u16>() * xlrec.ntuples as usize
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
                            };
                        assert_eq!(offset_array_len, buf.remaining());

@@ -920,7 +920,7 @@ impl WalIngest {
                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
                                0
                            } else {
-                                size_of::<u16>() * xlrec.ntuples as usize
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
                            };
                        assert_eq!(offset_array_len, buf.remaining());

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -241,9 +241,6 @@ impl PostgresRedoManager {

    /// Shut down the WAL redo manager.
    ///
-    /// Returns `true` if this call was the one that initiated shutdown.
-    /// `true` may be observed by no caller if the first caller stops polling.
-    ///
    /// After this future completes
    /// - no redo process is running
    /// - no new redo process will be spawned
@@ -253,32 +250,22 @@ impl PostgresRedoManager {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub async fn shutdown(&self) -> bool {
+    pub async fn shutdown(&self) {
        // prevent new processes from being spawned
-        let maybe_permit = match self.redo_process.get_or_init_detached().await {
+        let permit = match self.redo_process.get_or_init_detached().await {
            Ok(guard) => {
-                if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
-                    None
-                } else {
-                    let (proc, permit) = guard.take_and_deinit();
-                    drop(proc); // this just drops the Arc, its refcount may not be zero yet
-                    Some(permit)
-                }
+                let (proc, permit) = guard.take_and_deinit();
+                drop(proc); // this just drops the Arc, its refcount may not be zero yet
+                permit
            }
-            Err(permit) => Some(permit),
-        };
-        let it_was_us = if let Some(permit) = maybe_permit {
-            self.redo_process
-                .set(ProcessOnceCell::ManagerShutDown, permit);
-            true
-        } else {
-            false
+            Err(permit) => permit,
        };
+        self.redo_process
+            .set(ProcessOnceCell::ManagerShutDown, permit);
        // wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
        // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
        // for the underlying process.
        self.launched_processes.close().await;
-        it_was_us
    }

    /// This type doesn't have its own background task to check for idleness: we
--- a/pageserver/test_data/indices/mixed_workload/README.md
+++ b/pageserver/test_data/indices/mixed_workload/README.md
@@ -1,7 +0,0 @@
-
-# This was captured from one shard of a large tenant in staging.
-
-# It has a mixture of deltas and image layers, >1000 layers in total.
-
-# This is suitable for general smoke tests that want an index which is not
-# trivially small, but doesn't contain weird/pathological cases.
--- a/pageserver/test_data/indices/mixed_workload/index_part.json
+++ b/pageserver/test_data/indices/mixed_workload/index_part.json
--- a/poetry.lock
+++ b/poetry.lock
@@ -870,96 +870,6 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}

-[[package]]
-name = "clickhouse-connect"
-version = "0.7.17"
-description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
-optional = false
-python-versions = "~=3.8"
-files = [
-    {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
-]
-
-[package.dependencies]
-certifi = "*"
-lz4 = "*"
-pytz = "*"
-urllib3 = ">=1.26"
-zstandard = "*"
-
-[package.extras]
-arrow = ["pyarrow"]
-numpy = ["numpy"]
-orjson = ["orjson"]
-pandas = ["pandas"]
-sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
-tzlocal = ["tzlocal (>=4.0)"]
-
 [[package]]
 name = "colorama"
 version = "0.4.5"
@@ -1560,56 +1470,6 @@ files = [
    {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
 ]

-[[package]]
-name = "lz4"
-version = "4.3.3"
-description = "LZ4 Bindings for Python"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
-    {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
-    {file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
-    {file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
-    {file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
-    {file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
-    {file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
-    {file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
-    {file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
-    {file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
-    {file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
-    {file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
-    {file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
-    {file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
-    {file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
-    {file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
-    {file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
-    {file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
-    {file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
-    {file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
-    {file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
-]
-
-[package.extras]
-docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
-flake8 = ["flake8"]
-tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
-
 [[package]]
 name = "markupsafe"
 version = "2.1.1"
@@ -2501,17 +2361,6 @@ files = [
 [package.dependencies]
 six = ">=1.5"

-[[package]]
-name = "pytz"
-version = "2024.1"
-description = "World timezone definitions, modern and historical"
-optional = false
-python-versions = "*"
-files = [
-    {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
-    {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
-]
-
 [[package]]
 name = "pywin32"
 version = "301"
@@ -3357,4 +3206,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
+content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -1,7 +1,7 @@
 //! Man-in-the-middle tests
 //!
 //! Channel binding should prevent a proxy server
-//! *that has access to create valid certificates*
+//! - that has access to create valid certificates -
 //! from controlling the TLS connection.

 use std::fmt::Debug;
--- a/proxy/src/scram/countmin.rs
+++ b/proxy/src/scram/countmin.rs
@@ -158,7 +158,7 @@ mod tests {
        let N = 1021 * 4096;
        let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);

-        let memory = size_of::<u32>() * sketch.buckets.len();
+        let memory = std::mem::size_of::<u32>() * sketch.buckets.len();
        let time = sketch.depth;
        (memory, time)
    }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,6 @@ zstandard = "^0.21.0"
 httpx = {extras = ["http2"], version = "^0.26.0"}
 pytest-repeat = "^0.9.3"
 websockets = "^12.0"
-clickhouse-connect = "^0.7.16"

 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.80.0"
+channel = "1.79.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -27,7 +27,7 @@ pub const SK_FORMAT_VERSION: u32 = 9;
 pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
 // needed to atomically update the state using `rename`
 const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial";
-pub const CHECKSUM_SIZE: usize = size_of::<u32>();
+pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();

 /// Storage should keep actual state inside of it. It should implement Deref
 /// trait to access state fields and have persist method for updating that state.
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -143,12 +143,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                self.tenant_id.unwrap_or(TenantId::from([0u8; 16])),
                self.timeline_id.unwrap_or(TimelineId::from([0u8; 16])),
            );
-            tracing::Span::current()
-                .record("ttid", tracing::field::display(ttid))
-                .record(
-                    "application_name",
-                    tracing::field::debug(self.appname.clone()),
-                );
+            tracing::Span::current().record("ttid", tracing::field::display(ttid));

            Ok(())
        } else {
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -21,7 +21,6 @@ pub mod json_ctrl;
 pub mod metrics;
 pub mod patch_control_file;
 pub mod pull_timeline;
-pub mod rate_limit;
 pub mod receive_wal;
 pub mod recovery;
 pub mod remove_wal;
@@ -54,7 +53,6 @@ pub mod defaults {
    pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
    pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
    pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
-    pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2;

    // By default, our required residency before eviction is the same as the period that passes
    // before uploading a partial segment, so that in normal operation the eviction can happen
--- a/safekeeper/src/rate_limit.rs
+++ b/safekeeper/src/rate_limit.rs
@@ -1,49 +0,0 @@
-use std::sync::Arc;
-
-use rand::Rng;
-
-use crate::metrics::MISC_OPERATION_SECONDS;
-
-/// Global rate limiter for background tasks.
-#[derive(Clone)]
-pub struct RateLimiter {
-    partial_backup: Arc<tokio::sync::Semaphore>,
-    eviction: Arc<tokio::sync::Semaphore>,
-}
-
-impl RateLimiter {
-    /// Create a new rate limiter.
-    /// - `partial_backup_max`: maximum number of concurrent partial backups.
-    /// - `eviction_max`: maximum number of concurrent timeline evictions.
-    pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self {
-        Self {
-            partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)),
-            eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)),
-        }
-    }
-
-    /// Get a permit for partial backup. This will block if the maximum number of concurrent
-    /// partial backups is reached.
-    pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit {
-        let _timer = MISC_OPERATION_SECONDS
-            .with_label_values(&["partial_permit_acquire"])
-            .start_timer();
-        self.partial_backup
-            .clone()
-            .acquire_owned()
-            .await
-            .expect("semaphore is closed")
-    }
-
-    /// Try to get a permit for timeline eviction. This will return None if the maximum number of
-    /// concurrent timeline evictions is reached.
-    pub fn try_acquire_eviction(&self) -> Option<tokio::sync::OwnedSemaphorePermit> {
-        self.eviction.clone().try_acquire_owned().ok()
-    }
-}
-
-/// Generate a random duration that is a fraction of the given duration.
-pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration {
-    let randf64 = rand::thread_rng().gen_range(0.0..1.0);
-    duration.mul_f64(randf64)
-}
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -25,7 +25,6 @@ use utils::{
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;

-use crate::rate_limit::RateLimiter;
 use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{
    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
@@ -37,7 +36,7 @@ use crate::timeline_guard::ResidenceGuard;
 use crate::timeline_manager::{AtomicStatus, ManagerCtl};
 use crate::timelines_set::TimelinesSet;
 use crate::wal_backup::{self};
-use crate::wal_backup_partial::PartialRemoteSegment;
+use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter};
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};

 use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -5,6 +5,7 @@
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use remote_storage::RemotePath;
+use std::time::Instant;
 use tokio::{
    fs::File,
    io::{AsyncRead, AsyncWriteExt},
@@ -14,7 +15,6 @@ use utils::crashsafe::durable_rename;

 use crate::{
    metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
-    rate_limit::rand_duration,
    timeline_manager::{Manager, StateSnapshot},
    wal_backup,
    wal_backup_partial::{self, PartialRemoteSegment},
@@ -50,6 +50,7 @@ impl Manager {
                .flush_lsn
                .segment_number(self.wal_seg_size)
                == self.last_removed_segno + 1
+            && self.resident_since.elapsed() >= self.conf.eviction_min_resident
    }

    /// Evict the timeline to remote storage.
@@ -111,8 +112,7 @@ impl Manager {
            return;
        }

-        self.evict_not_before =
-            tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
+        self.resident_since = Instant::now();

        info!("successfully restored evicted timeline");
    }
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -23,7 +23,6 @@ use utils::lsn::Lsn;
 use crate::{
    control_file::{FileStorage, Storage},
    metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
-    rate_limit::{rand_duration, RateLimiter},
    recovery::recovery_main,
    remove_wal::calc_horizon_lsn,
    safekeeper::Term,
@@ -33,7 +32,7 @@ use crate::{
    timeline_guard::{AccessService, GuardId, ResidenceGuard},
    timelines_set::{TimelineSetGuard, TimelinesSet},
    wal_backup::{self, WalBackupTaskHandle},
-    wal_backup_partial::{self, PartialRemoteSegment},
+    wal_backup_partial::{self, PartialRemoteSegment, RateLimiter},
    SafeKeeperConf,
 };

@@ -186,11 +185,11 @@ pub(crate) struct Manager {

    // misc
    pub(crate) access_service: AccessService,
-    pub(crate) global_rate_limiter: RateLimiter,
+    pub(crate) partial_backup_rate_limiter: RateLimiter,

    // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not
    // evict them if they go inactive very soon after being restored.
-    pub(crate) evict_not_before: Instant,
+    pub(crate) resident_since: std::time::Instant,
 }

 /// This task gets spawned alongside each timeline and is responsible for managing the timeline's
@@ -203,7 +202,7 @@ pub async fn main_task(
    broker_active_set: Arc<TimelinesSet>,
    manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
    mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
-    global_rate_limiter: RateLimiter,
+    partial_backup_rate_limiter: RateLimiter,
 ) {
    tli.set_status(Status::Started);

@@ -221,7 +220,7 @@ pub async fn main_task(
        conf,
        broker_active_set,
        manager_tx,
-        global_rate_limiter,
+        partial_backup_rate_limiter,
    )
    .await;

@@ -255,29 +254,9 @@ pub async fn main_task(
            mgr.set_status(Status::UpdatePartialBackup);
            mgr.update_partial_backup(&state_snapshot).await;

-            let now = Instant::now();
-            if mgr.evict_not_before > now {
-                // we should wait until evict_not_before
-                update_next_event(&mut next_event, mgr.evict_not_before);
-            }
-
-            if mgr.conf.enable_offload
-                && mgr.evict_not_before <= now
-                && mgr.ready_for_eviction(&next_event, &state_snapshot)
-            {
-                // check rate limiter and evict timeline if possible
-                match mgr.global_rate_limiter.try_acquire_eviction() {
-                    Some(_permit) => {
-                        mgr.set_status(Status::EvictTimeline);
-                        mgr.evict_timeline().await;
-                    }
-                    None => {
-                        // we can't evict timeline now, will try again later
-                        mgr.evict_not_before =
-                            Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
-                        update_next_event(&mut next_event, mgr.evict_not_before);
-                    }
-                }
+            if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) {
+                mgr.set_status(Status::EvictTimeline);
+                mgr.evict_timeline().await;
            }
        }

@@ -355,10 +334,11 @@ impl Manager {
        conf: SafeKeeperConf,
        broker_active_set: Arc<TimelinesSet>,
        manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
-        global_rate_limiter: RateLimiter,
+        partial_backup_rate_limiter: RateLimiter,
    ) -> Manager {
        let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
        Manager {
+            conf,
            wal_seg_size: tli.get_wal_seg_size().await,
            walsenders: tli.get_walsenders().clone(),
            state_version_rx: tli.get_state_version_rx(),
@@ -373,10 +353,8 @@ impl Manager {
            partial_backup_uploaded,
            access_service: AccessService::new(manager_tx),
            tli,
-            global_rate_limiter,
-            // to smooth out evictions spike after restart
-            evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident),
-            conf,
+            partial_backup_rate_limiter,
+            resident_since: std::time::Instant::now(),
        }
    }

@@ -563,7 +541,7 @@ impl Manager {
        self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
            self.wal_resident_timeline(),
            self.conf.clone(),
-            self.global_rate_limiter.clone(),
+            self.partial_backup_rate_limiter.clone(),
        )));
    }

--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -2,11 +2,10 @@
 //! All timelines should always be present in this map, this is done by loading them
 //! all from the disk on startup and keeping them in memory.

-use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
-use crate::rate_limit::RateLimiter;
 use crate::safekeeper::ServerInfo;
 use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
 use crate::timelines_set::TimelinesSet;
+use crate::wal_backup_partial::RateLimiter;
 use crate::SafeKeeperConf;
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
@@ -32,7 +31,7 @@ struct GlobalTimelinesState {
    conf: Option<SafeKeeperConf>,
    broker_active_set: Arc<TimelinesSet>,
    load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
-    global_rate_limiter: RateLimiter,
+    partial_backup_rate_limiter: RateLimiter,
 }

 // Used to prevent concurrent timeline loading.
@@ -51,7 +50,7 @@ impl GlobalTimelinesState {
        (
            self.get_conf().clone(),
            self.broker_active_set.clone(),
-            self.global_rate_limiter.clone(),
+            self.partial_backup_rate_limiter.clone(),
        )
    }

@@ -86,7 +85,7 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
        conf: None,
        broker_active_set: Arc::new(TimelinesSet::default()),
        load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
-        global_rate_limiter: RateLimiter::new(1, 1),
+        partial_backup_rate_limiter: RateLimiter::new(1),
    })
 });

@@ -100,10 +99,7 @@ impl GlobalTimelines {
        // lock, so use explicit block
        let tenants_dir = {
            let mut state = TIMELINES_STATE.lock().unwrap();
-            state.global_rate_limiter = RateLimiter::new(
-                conf.partial_backup_concurrency,
-                DEFAULT_EVICTION_CONCURRENCY,
-            );
+            state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency);
            state.conf = Some(conf);

            // Iterate through all directories and load tenants for all directories
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -18,6 +18,8 @@
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.

+use std::sync::Arc;
+
 use camino::Utf8PathBuf;
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
 use remote_storage::RemotePath;
@@ -28,7 +30,6 @@ use utils::lsn::Lsn;

 use crate::{
    metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
-    rate_limit::{rand_duration, RateLimiter},
    safekeeper::Term,
    timeline::WalResidentTimeline,
    timeline_manager::StateSnapshot,
@@ -36,6 +37,30 @@ use crate::{
    SafeKeeperConf,
 };

+#[derive(Clone)]
+pub struct RateLimiter {
+    semaphore: Arc<tokio::sync::Semaphore>,
+}
+
+impl RateLimiter {
+    pub fn new(permits: usize) -> Self {
+        Self {
+            semaphore: Arc::new(tokio::sync::Semaphore::new(permits)),
+        }
+    }
+
+    async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit {
+        let _timer = MISC_OPERATION_SECONDS
+            .with_label_values(&["partial_permit_acquire"])
+            .start_timer();
+        self.semaphore
+            .clone()
+            .acquire_owned()
+            .await
+            .expect("semaphore is closed")
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub enum UploadStatus {
    /// Upload is in progress. This status should be used only for garbage collection,
@@ -327,7 +352,6 @@ pub async fn main_task(
 ) -> Option<PartialRemoteSegment> {
    debug!("started");
    let await_duration = conf.partial_backup_timeout;
-    let mut first_iteration = true;

    let (_, persistent_state) = tli.get_state().await;
    let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
@@ -395,15 +419,6 @@ pub async fn main_task(
            }
        }

-        // smoothing the load after restart, by sleeping for a random time.
-        // if this is not the first iteration, we will wait for the full await_duration
-        let await_duration = if first_iteration {
-            first_iteration = false;
-            rand_duration(&await_duration)
-        } else {
-            await_duration
-        };
-
        // fixing the segno and waiting some time to prevent reuploading the same segment too often
        let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
        let timeout = tokio::time::sleep(await_duration);
@@ -439,7 +454,7 @@ pub async fn main_task(
        }

        // limit concurrent uploads
-        let _upload_permit = limiter.acquire_partial_backup().await;
+        let _upload_permit = limiter.acquire_owned().await;

        let prepared = backup.prepare_upload().await;
        if let Some(seg) = &uploaded_segment {
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -43,7 +43,7 @@ pub async fn task_main(
                    error!("connection handler exited: {}", err);
                }
            }
-            .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)),
+            .instrument(info_span!("", cid = %conn_id, ttid = field::Empty)),
        );
    }
 }
--- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs
+++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs
@@ -172,7 +172,7 @@ fn write_walrecord_to_disk(
    let mut freespace = insert_freespace(curr_ptr);
    let mut written: usize = 0;

-    assert!(freespace >= size_of::<u32>());
+    assert!(freespace >= std::mem::size_of::<u32>());

    for mut rdata in rdatas {
        while rdata.len() >= freespace {
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -67,7 +67,6 @@ FALLBACK_DURATION = {
    "test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
    "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
-    "test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735,
    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -18,7 +18,6 @@ anyhow.workspace = true
 aws-config.workspace = true
 bytes.workspace = true
 camino.workspace = true
-chrono.workspace = true
 clap.workspace = true
 fail.workspace = true
 futures.workspace = true
@@ -32,7 +31,6 @@ once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
-rand.workspace = true
 reqwest = { workspace = true, features = ["stream"] }
 routerify.workspace = true
 serde.workspace = true
@@ -46,12 +44,7 @@ scopeguard.workspace = true
 strum.workspace = true
 strum_macros.workspace = true

-diesel = { version = "2.1.4", features = [
-    "serde_json",
-    "postgres",
-    "r2d2",
-    "chrono",
-] }
+diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }

@@ -59,3 +52,4 @@ utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
 control_plane = { path = "../control_plane" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
+
--- a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql
+++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql
@@ -1 +0,0 @@
-DROP TABLE metadata_health;
--- a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql
+++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql
@@ -1,14 +0,0 @@
-CREATE TABLE metadata_health (
-  tenant_id VARCHAR NOT NULL,
-  shard_number INTEGER NOT NULL,
-  shard_count INTEGER NOT NULL,
-  PRIMARY KEY(tenant_id, shard_number, shard_count),
-  -- Rely on cascade behavior for delete
-  FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE,
-  healthy BOOLEAN NOT NULL DEFAULT TRUE,
-  last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
-);
-
-
-INSERT INTO metadata_health(tenant_id, shard_number, shard_count)
-SELECT tenant_id, shard_number, shard_count FROM tenant_shards;
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -3,18 +3,14 @@ use crate::metrics::{
    METRICS_REGISTRY,
 };
 use crate::reconciler::ReconcileError;
-use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
+use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use anyhow::Context;
 use futures::Future;
 use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use metrics::{BuildInfo, NeonMetrics};
-use pageserver_api::controller_api::{
-    MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
-    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
-    TenantCreateRequest,
-};
+use pageserver_api::controller_api::TenantCreateRequest;
 use pageserver_api::models::{
    TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
    TenantTimeTravelRequest, TimelineCreateRequest,
@@ -564,51 +560,6 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
    json_response(StatusCode::ACCEPTED, ())
 }

-async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Scrubber)?;
-
-    let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
-    let state = get_state(&req);
-
-    state.service.metadata_health_update(update_req).await?;
-
-    json_response(StatusCode::OK, MetadataHealthUpdateResponse {})
-}
-
-async fn handle_metadata_health_list_unhealthy(
-    req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
-
-    json_response(
-        StatusCode::OK,
-        MetadataHealthListUnhealthyResponse {
-            unhealthy_tenant_shards,
-        },
-    )
-}
-
-async fn handle_metadata_health_list_outdated(
-    mut req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
-    let state = get_state(&req);
-    let health_records = state
-        .service
-        .metadata_health_list_outdated(list_outdated_req.not_scrubbed_for)
-        .await?;
-
-    json_response(
-        StatusCode::OK,
-        MetadataHealthListOutdatedResponse { health_records },
-    )
-}
-
 async fn handle_tenant_shard_split(
    service: Arc<Service>,
    mut req: Request<Body>,
@@ -656,13 +607,6 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
    )
 }

-async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.step_down().await)
-}
-
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;
@@ -790,47 +734,6 @@ struct RequestMeta {
    at: Instant,
 }

-pub fn prologue_leadership_status_check_middleware<
-    B: hyper::body::HttpBody + Send + Sync + 'static,
->() -> Middleware<B, ApiError> {
-    Middleware::pre(move |req| async move {
-        let state = get_state(&req);
-        let leadership_status = state.service.get_leadership_status();
-
-        enum AllowedRoutes<'a> {
-            All,
-            Some(Vec<&'a str>),
-        }
-
-        let allowed_routes = match leadership_status {
-            LeadershipStatus::Leader => AllowedRoutes::All,
-            LeadershipStatus::SteppedDown => {
-                // TODO: does it make sense to allow /status here?
-                AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec())
-            }
-            LeadershipStatus::Candidate => {
-                AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec())
-            }
-        };
-
-        let uri = req.uri().to_string();
-        match allowed_routes {
-            AllowedRoutes::All => Ok(req),
-            AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req),
-            _ => {
-                tracing::info!(
-                    "Request {} not allowed due to current leadership state",
-                    req.uri()
-                );
-
-                Err(ApiError::ResourceUnavailable(
-                    format!("Current leadership status is {leadership_status}").into(),
-                ))
-            }
-        }
-    })
-}
-
 fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
@@ -917,7 +820,6 @@ pub fn make_router(
    build_info: BuildInfo,
 ) -> RouterBuilder<hyper::Body, ApiError> {
    let mut router = endpoint::make_router()
-        .middleware(prologue_leadership_status_check_middleware())
        .middleware(prologue_metrics_middleware())
        .middleware(epilogue_metrics_middleware());
    if auth.is_some() {
@@ -1036,28 +938,6 @@ pub fn make_router(
                RequestName("control_v1_cancel_node_fill"),
            )
        })
-        // Metadata health operations
-        .post("/control/v1/metadata_health/update", |r| {
-            named_request_span(
-                r,
-                handle_metadata_health_update,
-                RequestName("control_v1_metadata_health_update"),
-            )
-        })
-        .get("/control/v1/metadata_health/unhealthy", |r| {
-            named_request_span(
-                r,
-                handle_metadata_health_list_unhealthy,
-                RequestName("control_v1_metadata_health_list_unhealthy"),
-            )
-        })
-        .post("/control/v1/metadata_health/outdated", |r| {
-            named_request_span(
-                r,
-                handle_metadata_health_list_outdated,
-                RequestName("control_v1_metadata_health_list_outdated"),
-            )
-        })
        // TODO(vlad): endpoint for cancelling drain and fill
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
@@ -1091,9 +971,6 @@ pub fn make_router(
                RequestName("control_v1_tenant_policy"),
            )
        })
-        .put("/control/v1/step_down", |r| {
-            named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
-        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -9,14 +9,12 @@ use std::time::Duration;
 use storage_controller::http::make_router;
 use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
-use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
    Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
    RECONCILER_CONCURRENCY_DEFAULT,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
-use tracing::Instrument;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};

@@ -88,10 +86,6 @@ struct Cli {
    // TODO: make `cfg(feature = "testing")`
    #[arg(long)]
    neon_local_repo_dir: Option<PathBuf>,
-
-    /// Chaos testing
-    #[arg(long)]
-    chaos_interval: Option<humantime::Duration>,
 }

 enum StrictMode {
@@ -315,22 +309,6 @@ async fn async_main() -> anyhow::Result<()> {
    tracing::info!("Serving on {0}", args.listen);
    let server_task = tokio::task::spawn(server);

-    let chaos_task = args.chaos_interval.map(|interval| {
-        let service = service.clone();
-        let cancel = CancellationToken::new();
-        let cancel_bg = cancel.clone();
-        (
-            tokio::task::spawn(
-                async move {
-                    let mut chaos_injector = ChaosInjector::new(service, interval.into());
-                    chaos_injector.run(cancel_bg).await
-                }
-                .instrument(tracing::info_span!("chaos_injector")),
-            ),
-            cancel,
-        )
-    });
-
    // Wait until we receive a signal
    let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
    let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
@@ -359,12 +337,6 @@ async fn async_main() -> anyhow::Result<()> {
        }
    }

-    // If we were injecting chaos, stop that so that we're not calling into Service while it shuts down
-    if let Some((chaos_jh, chaos_cancel)) = chaos_task {
-        chaos_cancel.cancel();
-        chaos_jh.await.ok();
-    }
-
    service.shutdown().await;
    tracing::info!("Service shutdown complete");

--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -13,10 +13,7 @@ use metrics::NeonMetrics;
 use once_cell::sync::Lazy;
 use std::sync::Mutex;

-use crate::{
-    persistence::{DatabaseError, DatabaseOperation},
-    service::LeadershipStatus,
-};
+use crate::persistence::{DatabaseError, DatabaseOperation};

 pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
    Lazy::new(StorageControllerMetrics::default);
@@ -84,8 +81,6 @@ pub(crate) struct StorageControllerMetricGroup {
    #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
    pub(crate) storage_controller_database_query_latency:
        measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
-
-    pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
 }

 impl StorageControllerMetrics {
@@ -161,12 +156,6 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
    pub(crate) operation: DatabaseOperation,
 }

-#[derive(measured::LabelGroup)]
-#[label(set = LeadershipStatusGroupSet)]
-pub(crate) struct LeadershipStatusGroup {
-    pub(crate) status: LeadershipStatus,
-}
-
 #[derive(FixedCardinalityLabel, Clone, Copy)]
 pub(crate) enum ReconcileOutcome {
    #[label(rename = "ok")]
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -8,7 +8,6 @@ use self::split_state::SplitState;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
-use pageserver_api::controller_api::MetadataHealthRecord;
 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
@@ -91,10 +90,6 @@ pub(crate) enum DatabaseOperation {
    UpdateTenantShard,
    DeleteTenant,
    UpdateTenantConfig,
-    UpdateMetadataHealth,
-    ListMetadataHealth,
-    ListMetadataHealthUnhealthy,
-    ListMetadataHealthOutdated,
 }

 #[must_use]
@@ -312,32 +307,15 @@ impl Persistence {
        &self,
        shards: Vec<TenantShardPersistence>,
    ) -> DatabaseResult<()> {
-        use crate::schema::metadata_health;
-        use crate::schema::tenant_shards;
-
-        let now = chrono::Utc::now();
-
-        let metadata_health_records = shards
-            .iter()
-            .map(|t| MetadataHealthPersistence {
-                tenant_id: t.tenant_id.clone(),
-                shard_number: t.shard_number,
-                shard_count: t.shard_count,
-                healthy: true,
-                last_scrubbed_at: now,
-            })
-            .collect::<Vec<_>>();
-
+        use crate::schema::tenant_shards::dsl::*;
        self.with_measured_conn(
            DatabaseOperation::InsertTenantShards,
            move |conn| -> DatabaseResult<()> {
-                diesel::insert_into(tenant_shards::table)
-                    .values(&shards)
-                    .execute(conn)?;
-
-                diesel::insert_into(metadata_health::table)
-                    .values(&metadata_health_records)
-                    .execute(conn)?;
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
                Ok(())
            },
        )
@@ -351,10 +329,10 @@ impl Persistence {
        self.with_measured_conn(
            DatabaseOperation::DeleteTenant,
            move |conn| -> DatabaseResult<()> {
-                // `metadata_health` status (if exists) is also deleted based on the cascade behavior.
                diesel::delete(tenant_shards)
                    .filter(tenant_id.eq(del_tenant_id.to_string()))
                    .execute(conn)?;
+
                Ok(())
            },
        )
@@ -697,94 +675,6 @@ impl Persistence {
        )
        .await
    }
-
-    /// Stores all the latest metadata health updates durably. Updates existing entry on conflict.
-    ///
-    /// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller.
-    #[allow(dead_code)]
-    pub(crate) async fn update_metadata_health_records(
-        &self,
-        healthy_records: Vec<MetadataHealthPersistence>,
-        unhealthy_records: Vec<MetadataHealthPersistence>,
-        now: chrono::DateTime<chrono::Utc>,
-    ) -> DatabaseResult<()> {
-        use crate::schema::metadata_health::dsl::*;
-
-        self.with_measured_conn(
-            DatabaseOperation::UpdateMetadataHealth,
-            move |conn| -> DatabaseResult<_> {
-                diesel::insert_into(metadata_health)
-                    .values(&healthy_records)
-                    .on_conflict((tenant_id, shard_number, shard_count))
-                    .do_update()
-                    .set((healthy.eq(true), last_scrubbed_at.eq(now)))
-                    .execute(conn)?;
-
-                diesel::insert_into(metadata_health)
-                    .values(&unhealthy_records)
-                    .on_conflict((tenant_id, shard_number, shard_count))
-                    .do_update()
-                    .set((healthy.eq(false), last_scrubbed_at.eq(now)))
-                    .execute(conn)?;
-                Ok(())
-            },
-        )
-        .await
-    }
-
-    /// Lists all the metadata health records.
-    #[allow(dead_code)]
-    pub(crate) async fn list_metadata_health_records(
-        &self,
-    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
-        self.with_measured_conn(
-            DatabaseOperation::ListMetadataHealth,
-            move |conn| -> DatabaseResult<_> {
-                Ok(
-                    crate::schema::metadata_health::table
-                        .load::<MetadataHealthPersistence>(conn)?,
-                )
-            },
-        )
-        .await
-    }
-
-    /// Lists all the metadata health records that is unhealthy.
-    #[allow(dead_code)]
-    pub(crate) async fn list_unhealthy_metadata_health_records(
-        &self,
-    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
-        use crate::schema::metadata_health::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::ListMetadataHealthUnhealthy,
-            move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::metadata_health::table
-                    .filter(healthy.eq(false))
-                    .load::<MetadataHealthPersistence>(conn)?)
-            },
-        )
-        .await
-    }
-
-    /// Lists all the metadata health records that have not been updated since an `earlier` time.
-    #[allow(dead_code)]
-    pub(crate) async fn list_outdated_metadata_health_records(
-        &self,
-        earlier: chrono::DateTime<chrono::Utc>,
-    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
-        use crate::schema::metadata_health::dsl::*;
-
-        self.with_measured_conn(
-            DatabaseOperation::ListMetadataHealthOutdated,
-            move |conn| -> DatabaseResult<_> {
-                let query = metadata_health.filter(last_scrubbed_at.lt(earlier));
-                let res = query.load::<MetadataHealthPersistence>(conn)?;
-
-                Ok(res)
-            },
-        )
-        .await
-    }
 }

 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -854,59 +744,3 @@ pub(crate) struct NodePersistence {
    pub(crate) listen_pg_addr: String,
    pub(crate) listen_pg_port: i32,
 }
-
-/// Tenant metadata health status that are stored durably.
-#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
-#[diesel(table_name = crate::schema::metadata_health)]
-pub(crate) struct MetadataHealthPersistence {
-    #[serde(default)]
-    pub(crate) tenant_id: String,
-    #[serde(default)]
-    pub(crate) shard_number: i32,
-    #[serde(default)]
-    pub(crate) shard_count: i32,
-
-    pub(crate) healthy: bool,
-    pub(crate) last_scrubbed_at: chrono::DateTime<chrono::Utc>,
-}
-
-impl MetadataHealthPersistence {
-    pub fn new(
-        tenant_shard_id: TenantShardId,
-        healthy: bool,
-        last_scrubbed_at: chrono::DateTime<chrono::Utc>,
-    ) -> Self {
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let shard_number = tenant_shard_id.shard_number.0 as i32;
-        let shard_count = tenant_shard_id.shard_count.literal() as i32;
-
-        MetadataHealthPersistence {
-            tenant_id,
-            shard_number,
-            shard_count,
-            healthy,
-            last_scrubbed_at,
-        }
-    }
-
-    #[allow(dead_code)]
-    pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
-        Ok(TenantShardId {
-            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
-            shard_number: ShardNumber(self.shard_number as u8),
-            shard_count: ShardCount::new(self.shard_count as u8),
-        })
-    }
-}
-
-impl From<MetadataHealthPersistence> for MetadataHealthRecord {
-    fn from(value: MetadataHealthPersistence) -> Self {
-        MetadataHealthRecord {
-            tenant_shard_id: value
-                .get_tenant_shard_id()
-                .expect("stored tenant id should be valid"),
-            healthy: value.healthy,
-            last_scrubbed_at: value.last_scrubbed_at,
-        }
-    }
-}
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -12,7 +12,6 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
-use utils::failpoint_support;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
 use utils::lsn::Lsn;
@@ -656,8 +655,11 @@ impl Reconciler {
                    // reconcile this location.  This includes locations with different configurations, as well
                    // as locations with unknown (None) observed state.

-                    // Incrementing generation is the safe general case, but is inefficient for changes that only
-                    // modify some details (e.g. the tenant's config).
+                    // The general case is to increment the generation.  However, there are cases
+                    // where this is not necessary:
+                    // - if we are only updating the TenantConf part of the location
+                    // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
+                    //   and the location was already in the correct generation
                    let increment_generation = match observed {
                        None => true,
                        Some(ObservedStateLocation { conf: None }) => true,
@@ -666,11 +668,18 @@ impl Reconciler {
                        }) => {
                            let generations_match = observed.generation == wanted_conf.generation;

-                            // We may skip incrementing the generation if the location is already in the expected mode and
-                            // generation.  In principle it would also be safe to skip from certain other modes (e.g. AttachedStale),
-                            // but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up
-                            // after a restart/crash, so fall back to the universally safe path of incrementing generation.
-                            !generations_match || (observed.mode != wanted_conf.mode)
+                            use LocationConfigMode::*;
+                            let mode_transition_requires_gen_inc =
+                                match (observed.mode, wanted_conf.mode) {
+                                    // Usually the short-lived attachment modes (multi and stale) are only used
+                                    // in the case of [`Self::live_migrate`], but it is simple to handle them correctly
+                                    // here too.  Locations are allowed to go Single->Stale and Multi->Single within the same generation.
+                                    (AttachedSingle, AttachedStale) => false,
+                                    (AttachedMulti, AttachedSingle) => false,
+                                    (lhs, rhs) => lhs != rhs,
+                                };
+
+                            !generations_match || mode_transition_requires_gen_inc
                        }
                    };

@@ -740,8 +749,6 @@ impl Reconciler {
            self.location_config(&node, conf, None, false).await?;
        }

-        failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
-
        Ok(())
    }

--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -1,15 +1,5 @@
 // @generated automatically by Diesel CLI.

-diesel::table! {
-    metadata_health (tenant_id, shard_number, shard_count) {
-        tenant_id -> Varchar,
-        shard_number -> Int4,
-        shard_count -> Int4,
-        healthy -> Bool,
-        last_scrubbed_at -> Timestamptz,
-    }
-}
-
 diesel::table! {
    nodes (node_id) {
        node_id -> Int8,
@@ -36,4 +26,4 @@ diesel::table! {
    }
 }

-diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);
+diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -15,8 +15,7 @@ use crate::{
    },
    compute_hook::NotifyError,
    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
-    metrics::LeadershipStatusGroup,
-    persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
+    persistence::{AbortShardSplitStatus, TenantFilter},
    reconciler::{ReconcileError, ReconcileUnits},
    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
    tenant_shard::{
@@ -33,11 +32,11 @@ use futures::{stream::FuturesUnordered, StreamExt};
 use itertools::Itertools;
 use pageserver_api::{
    controller_api::{
-        MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
-        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
+        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
+        ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
+        TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
+        TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
+        TenantShardMigrateResponse, UtilizationScore,
    },
    models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
@@ -82,9 +81,6 @@ use crate::{
        ReconcilerWaiter, TenantShard,
    },
 };
-use serde::{Deserialize, Serialize};
-
-pub mod chaos_injector;

 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -135,24 +131,6 @@ enum NodeOperations {
    Delete,
 }

-/// The leadership status for the storage controller process.
-/// Allowed transitions are:
-/// 1. Leader -> SteppedDown
-/// 2. Candidate -> Leader
-#[derive(Copy, Clone, strum_macros::Display, measured::FixedCardinalityLabel)]
-#[strum(serialize_all = "snake_case")]
-pub(crate) enum LeadershipStatus {
-    /// This is the steady state where the storage controller can produce
-    /// side effects in the cluster.
-    Leader,
-    /// We've been notified to step down by another candidate. No reconciliations
-    /// take place in this state.
-    SteppedDown,
-    /// Initial state for a new storage controller instance. Will attempt to assume leadership.
-    #[allow(unused)]
-    Candidate,
-}
-
 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;

 // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
@@ -162,8 +140,6 @@ const MAX_DELAYED_RECONCILES: usize = 10000;

 // Top level state available to all HTTP handlers
 struct ServiceState {
-    leadership_status: LeadershipStatus,
-
    tenants: BTreeMap<TenantShardId, TenantShard>,

    nodes: Arc<HashMap<NodeId, Node>>,
@@ -226,21 +202,7 @@ impl ServiceState {
        scheduler: Scheduler,
        delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
    ) -> Self {
-        let status = &crate::metrics::METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_leadership_status;
-
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::Leader,
-            },
-            1,
-        );
-
        Self {
-            // TODO: Starting up as Leader is a transient state. Once we enable rolling
-            // upgrades on the k8s side, we should start up as Candidate.
-            leadership_status: LeadershipStatus::Leader,
            tenants,
            nodes: Arc::new(nodes),
            scheduler,
@@ -258,37 +220,6 @@ impl ServiceState {
    ) {
        (&mut self.nodes, &mut self.tenants, &mut self.scheduler)
    }
-
-    fn get_leadership_status(&self) -> LeadershipStatus {
-        self.leadership_status
-    }
-
-    fn step_down(&mut self) {
-        self.leadership_status = LeadershipStatus::SteppedDown;
-
-        let status = &crate::metrics::METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_leadership_status;
-
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::SteppedDown,
-            },
-            1,
-        );
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::Leader,
-            },
-            0,
-        );
-        status.set(
-            LeadershipStatusGroup {
-                status: LeadershipStatus::Candidate,
-            },
-            0,
-        );
-    }
 }

 #[derive(Clone)]
@@ -472,30 +403,11 @@ struct ShardUpdate {
    generation: Option<Generation>,
 }

-enum StopReconciliationsReason {
-    ShuttingDown,
-    SteppingDown,
-}
-
-impl std::fmt::Display for StopReconciliationsReason {
-    fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
-        let s = match self {
-            Self::ShuttingDown => "Shutting down",
-            Self::SteppingDown => "Stepping down",
-        };
-        write!(writer, "{}", s)
-    }
-}
-
 pub(crate) enum ReconcileResultRequest {
    ReconcileResult(ReconcileResult),
    Stop,
 }

-// TODO: move this into the storcon peer client when that gets added
-#[derive(Serialize, Deserialize, Debug, Default)]
-pub(crate) struct GlobalObservedState(HashMap<TenantShardId, ObservedState>);
-
 impl Service {
    pub fn get_config(&self) -> &Config {
        &self.config
@@ -2930,6 +2842,7 @@ impl Service {
            );

            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+
            client
                .timeline_detach_ancestor(tenant_shard_id, timeline_id)
                .await
@@ -2946,7 +2859,8 @@ impl Service {
                        Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
                            ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
                        }
-                        // rest can be mapped
+                        // rest can be mapped as usual
+                        // FIXME: this converts some 500 responses to 409, which does not match the OpenAPI spec
                        other => passthrough_api_error(&node, other),
                    }
                })
@@ -2954,7 +2868,6 @@ impl Service {
        }

        // no shard needs to go first/last; the operation should be idempotent
-        // TODO: it would be great to ensure that all shards return the same error
        let mut results = self
            .tenant_for_shards(targets, |tenant_shard_id, node| {
                futures::FutureExt::boxed(detach_one(
@@ -2973,6 +2886,7 @@ impl Service {
            .filter(|(_, res)| res != &any.1)
            .collect::<Vec<_>>();
        if !mismatching.is_empty() {
+            // this can be hit by races, which should not happen because of the operation lock on cplane
            let matching = results.len() - mismatching.len();
            tracing::error!(
                matching,
@@ -5691,22 +5605,17 @@ impl Service {
        Ok(std::cmp::max(waiter_count, reconciles_spawned))
    }

-    async fn stop_reconciliations(&self, reason: StopReconciliationsReason) {
+    pub async fn shutdown(&self) {
        // Cancel all on-going reconciles and wait for them to exit the gate.
-        tracing::info!("{reason}: cancelling and waiting for in-flight reconciles");
+        tracing::info!("Shutting down: cancelling and waiting for in-flight reconciles");
        self.reconcilers_cancel.cancel();
        self.reconcilers_gate.close().await;

        // Signal the background loop in [`Service::process_results`] to exit once
        // it has proccessed the results from all the reconciles we cancelled earlier.
-        tracing::info!("{reason}: processing results from previously in-flight reconciles");
+        tracing::info!("Shutting down: processing results from previously in-flight reconciles");
        self.result_tx.send(ReconcileResultRequest::Stop).ok();
        self.result_tx.closed().await;
-    }
-
-    pub async fn shutdown(&self) {
-        self.stop_reconciliations(StopReconciliationsReason::ShuttingDown)
-            .await;

        // Background tasks hold gate guards: this notifies them of the cancellation and
        // waits for them all to complete.
@@ -6096,89 +6005,4 @@ impl Service {

        Ok(())
    }
-
-    /// Updates scrubber metadata health check results.
-    pub(crate) async fn metadata_health_update(
-        &self,
-        update_req: MetadataHealthUpdateRequest,
-    ) -> Result<(), ApiError> {
-        let now = chrono::offset::Utc::now();
-        let (healthy_records, unhealthy_records) = {
-            let locked = self.inner.read().unwrap();
-            let healthy_records = update_req
-                .healthy_tenant_shards
-                .into_iter()
-                // Retain only health records associated with tenant shards managed by storage controller.
-                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
-                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now))
-                .collect();
-            let unhealthy_records = update_req
-                .unhealthy_tenant_shards
-                .into_iter()
-                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
-                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now))
-                .collect();
-
-            (healthy_records, unhealthy_records)
-        };
-
-        self.persistence
-            .update_metadata_health_records(healthy_records, unhealthy_records, now)
-            .await?;
-        Ok(())
-    }
-
-    /// Lists the tenant shards that has unhealthy metadata status.
-    pub(crate) async fn metadata_health_list_unhealthy(
-        &self,
-    ) -> Result<Vec<TenantShardId>, ApiError> {
-        let result = self
-            .persistence
-            .list_unhealthy_metadata_health_records()
-            .await?
-            .iter()
-            .map(|p| p.get_tenant_shard_id().unwrap())
-            .collect();
-
-        Ok(result)
-    }
-
-    /// Lists the tenant shards that have not been scrubbed for some duration.
-    pub(crate) async fn metadata_health_list_outdated(
-        &self,
-        not_scrubbed_for: Duration,
-    ) -> Result<Vec<MetadataHealthRecord>, ApiError> {
-        let earlier = chrono::offset::Utc::now() - not_scrubbed_for;
-        let result = self
-            .persistence
-            .list_outdated_metadata_health_records(earlier)
-            .await?
-            .into_iter()
-            .map(|record| record.into())
-            .collect();
-        Ok(result)
-    }
-
-    pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
-        self.inner.read().unwrap().get_leadership_status()
-    }
-
-    pub(crate) async fn step_down(&self) -> GlobalObservedState {
-        tracing::info!("Received step down request from peer");
-
-        self.inner.write().unwrap().step_down();
-        // TODO: would it make sense to have a time-out for this?
-        self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
-            .await;
-
-        let mut global_observed = GlobalObservedState::default();
-        let locked = self.inner.read().unwrap();
-        for (tid, tenant_shard) in locked.tenants.iter() {
-            global_observed
-                .0
-                .insert(*tid, tenant_shard.observed.clone());
-        }
-
-        global_observed
-    }
 }
--- a/storage_controller/src/service/chaos_injector.rs
+++ b/storage_controller/src/service/chaos_injector.rs
@@ -1,71 +0,0 @@
-use std::{sync::Arc, time::Duration};
-
-use rand::seq::SliceRandom;
-use rand::thread_rng;
-use tokio_util::sync::CancellationToken;
-
-use super::Service;
-
-pub struct ChaosInjector {
-    service: Arc<Service>,
-    interval: Duration,
-}
-
-impl ChaosInjector {
-    pub fn new(service: Arc<Service>, interval: Duration) -> Self {
-        Self { service, interval }
-    }
-
-    pub async fn run(&mut self, cancel: CancellationToken) {
-        let mut interval = tokio::time::interval(self.interval);
-
-        loop {
-            tokio::select! {
-                _ = interval.tick() => {}
-                _ = cancel.cancelled() => {
-                    tracing::info!("Shutting down");
-                    return;
-                }
-            }
-
-            self.inject_chaos().await;
-
-            tracing::info!("Chaos iteration...");
-        }
-    }
-
-    async fn inject_chaos(&mut self) {
-        // Pick some shards to interfere with
-        let batch_size = 128;
-        let mut inner = self.service.inner.write().unwrap();
-        let (nodes, tenants, scheduler) = inner.parts_mut();
-        let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
-        let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);
-
-        for victim in victims {
-            let shard = tenants
-                .get_mut(victim)
-                .expect("Held lock between choosing ID and this get");
-
-            // Pick a secondary to promote
-            let Some(new_location) = shard
-                .intent
-                .get_secondary()
-                .choose(&mut thread_rng())
-                .cloned()
-            else {
-                tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
-                continue;
-            };
-
-            let Some(old_location) = *shard.intent.get_attached() else {
-                tracing::info!("Skipping shard {victim}: currently has no attached location");
-                continue;
-            };
-
-            shard.intent.demote_attached(scheduler, old_location);
-            shard.intent.promote_attached(scheduler, new_location);
-            self.service.maybe_reconcile_shard(shard, nodes);
-        }
-    }
-}
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -18,7 +18,7 @@ use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
 };
-use serde::{Deserialize, Serialize};
+use serde::Serialize;
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::{instrument, Instrument};
@@ -284,7 +284,7 @@ impl Drop for IntentState {
    }
 }

-#[derive(Default, Clone, Serialize, Deserialize, Debug)]
+#[derive(Default, Clone, Serialize)]
 pub(crate) struct ObservedState {
    pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
 }
@@ -298,7 +298,7 @@ pub(crate) struct ObservedState {
 ///       what it is (e.g. we failed partway through configuring it)
 ///     * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
 ///       and that configuration will still be present unless something external interfered.
-#[derive(Clone, Serialize, Deserialize, Debug)]
+#[derive(Clone, Serialize)]
 pub(crate) struct ObservedStateLocation {
    /// If None, it means we do not know the status of this shard's location on this node, but
    /// we know that we might have some state on this node.
--- a/storage_scrubber/Cargo.toml
+++ b/storage_scrubber/Cargo.toml
@@ -10,7 +10,6 @@ aws-smithy-async.workspace = true
 either.workspace = true
 tokio-rustls.workspace = true
 anyhow.workspace = true
-git-version.workspace = true
 hex.workspace = true
 humantime.workspace = true
 thiserror.workspace = true
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -40,11 +40,6 @@ impl TimelineAnalysis {
            garbage_keys: Vec::new(),
        }
    }
-
-    /// Whether a timeline is healthy.
-    pub(crate) fn is_healthy(&self) -> bool {
-        self.errors.is_empty() && self.warnings.is_empty()
-    }
 }

 pub(crate) async fn branch_cleanup_and_check_errors(
@@ -92,8 +87,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                            .push(format!("index_part.json version: {}", index_part.version()))
                    }

-                    let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2);
-                    if !newest_versions.any(|ip| ip == &index_part.version()) {
+                    if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
                        info!(
                            "index_part.json version is not latest: {}",
                            index_part.version()
--- a/storage_scrubber/src/find_large_objects.rs
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -1,13 +1,10 @@
-use std::pin::pin;
-
 use futures::{StreamExt, TryStreamExt};
 use pageserver::tenant::storage_layer::LayerName;
-use remote_storage::ListingMode;
 use serde::{Deserialize, Serialize};

 use crate::{
-    checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
-    stream_objects_with_retries, BucketConfig, NodeKind,
+    checks::parse_layer_object_name, init_remote, list_objects_with_retries,
+    metadata_stream::stream_tenants, BucketConfig, NodeKind,
 };

 #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -50,38 +47,45 @@ pub async fn find_large_objects(
    ignore_deltas: bool,
    concurrency: usize,
 ) -> anyhow::Result<LargeObjectListing> {
-    let (remote_client, target) =
-        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
-    let tenants = pin!(stream_tenants_generic(&remote_client, &target));
+    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let tenants = std::pin::pin!(stream_tenants(&s3_client, &target));

    let objects_stream = tenants.map_ok(|tenant_shard_id| {
        let mut tenant_root = target.tenant_root(&tenant_shard_id);
-        let remote_client = remote_client.clone();
+        let s3_client = s3_client.clone();
        async move {
            let mut objects = Vec::new();
            let mut total_objects_ctr = 0u64;
            // We want the objects and not just common prefixes
            tenant_root.delimiter.clear();
-            let mut objects_stream = pin!(stream_objects_with_retries(
-                &remote_client,
-                ListingMode::NoDelimiter,
-                &tenant_root
-            ));
-            while let Some(listing) = objects_stream.next().await {
-                let listing = listing?;
-                for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) {
-                    let key = obj.key.to_string();
+            let mut continuation_token = None;
+            loop {
+                let fetch_response =
+                    list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone())
+                        .await?;
+                for obj in fetch_response.contents().iter().filter(|o| {
+                    if let Some(obj_size) = o.size {
+                        min_size as i64 <= obj_size
+                    } else {
+                        false
+                    }
+                }) {
+                    let key = obj.key().expect("couldn't get key").to_owned();
                    let kind = LargeObjectKind::from_key(&key);
                    if ignore_deltas && kind == LargeObjectKind::DeltaLayer {
                        continue;
                    }
                    objects.push(LargeObject {
                        key,
-                        size: obj.size,
+                        size: obj.size.unwrap() as u64,
                        kind,
                    })
                }
-                total_objects_ctr += listing.keys.len() as u64;
+                total_objects_ctr += fetch_response.contents().len() as u64;
+                match fetch_response.next_continuation_token {
+                    Some(new_token) => continuation_token = Some(new_token),
+                    None => break,
+                }
            }

            Ok((tenant_shard_id, objects, total_objects_ctr))
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -5,7 +5,6 @@
 use std::{
    collections::{HashMap, HashSet},
    sync::Arc,
-    time::Duration,
 };

 use anyhow::Context;
@@ -19,8 +18,8 @@ use utils::id::TenantId;

 use crate::{
    cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
-    init_remote_generic, list_objects_with_retries_generic,
-    metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
+    init_remote, init_remote_generic,
+    metadata_stream::{stream_tenant_timelines, stream_tenants},
    BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
 };

@@ -28,11 +27,6 @@ use crate::{
 enum GarbageReason {
    DeletedInConsole,
    MissingInConsole,
-
-    // The remaining data relates to a known deletion issue, and we're sure that purging this
-    // will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where
-    // there is nothing in a tenant path apart from a heatmap file.
-    KnownBug,
 }

 #[derive(Serialize, Deserialize, Debug)]
@@ -78,15 +72,6 @@ impl GarbageList {
        }
    }

-    /// If an entity has been identified as requiring purge due to a known bug, e.g.
-    /// a particular type of object left behind after an incomplete deletion.
-    fn append_buggy(&mut self, entity: GarbageEntity) {
-        self.items.push(GarbageItem {
-            entity,
-            reason: GarbageReason::KnownBug,
-        });
-    }
-
    /// Return true if appended, false if not.  False means the result was not garbage.
    fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool
    where
@@ -153,7 +138,7 @@ async fn find_garbage_inner(
    node_kind: NodeKind,
 ) -> anyhow::Result<GarbageList> {
    // Construct clients for S3 and for Console API
-    let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
+    let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
    let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));

    // Build a set of console-known tenants, for quickly eliminating known-active tenants without having
@@ -179,7 +164,7 @@ async fn find_garbage_inner(

    // Enumerate Tenants in S3, and check if each one exists in Console
    tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
-    let tenants = stream_tenants_generic(&remote_client, &target);
+    let tenants = stream_tenants(&s3_client, &target);
    let tenants_checked = tenants.map_ok(|t| {
        let api_client = cloud_admin_api_client.clone();
        let console_cache = console_cache.clone();
@@ -234,66 +219,6 @@ async fn find_garbage_inner(
            assert!(project.tenant == tenant_shard_id.tenant_id);
        }

-        // Special case: If it's missing in console, check for known bugs that would enable us to conclusively
-        // identify it as purge-able anyway
-        if console_result.is_none() {
-            let timelines =
-                stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
-                    .await?
-                    .collect::<Vec<_>>()
-                    .await;
-            if timelines.is_empty() {
-                // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
-                let tenant_objects = list_objects_with_retries_generic(
-                    &remote_client,
-                    ListingMode::WithDelimiter,
-                    &target.tenant_root(&tenant_shard_id),
-                )
-                .await?;
-                let object = tenant_objects.keys.first().unwrap();
-                if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
-                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
-                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
-                    continue;
-                } else {
-                    tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
-                }
-            } else {
-                // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
-                // rollout of WAL DR in which we never deleted these.
-                let mut any_non_initdb = false;
-
-                for timeline_r in timelines {
-                    let timeline = timeline_r?;
-                    let timeline_objects = list_objects_with_retries_generic(
-                        &remote_client,
-                        ListingMode::WithDelimiter,
-                        &target.timeline_root(&timeline),
-                    )
-                    .await?;
-                    if !timeline_objects.prefixes.is_empty() {
-                        // Sub-paths?  Unexpected
-                        any_non_initdb = true;
-                    } else {
-                        let object = timeline_objects.keys.first().unwrap();
-                        if object.key.get_path().as_str().ends_with("initdb.tar.zst") {
-                            tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
-                        } else {
-                            any_non_initdb = true;
-                        }
-                    }
-                }
-
-                if any_non_initdb {
-                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb");
-                } else {
-                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb");
-                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
-                    continue;
-                }
-            }
-        }
-
        if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
            tracing::debug!("Tenant {tenant_shard_id} is garbage");
        } else {
@@ -331,8 +256,7 @@ async fn find_garbage_inner(

    // Construct a stream of all timelines within active tenants
    let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
-    let timelines =
-        active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
+    let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t));
    let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
    let timelines = timelines.try_flatten();

@@ -425,6 +349,9 @@ pub async fn get_timeline_objects(
    tracing::debug!("Listing objects in timeline {ttid}");
    let timeline_root = super::remote_timeline_path_id(&ttid);

+    // TODO: apply extra validation based on object modification time.  Don't purge
+    // timelines whose index_part.json has been touched recently.
+
    let list = s3_client
        .list(
            Some(&timeline_root),
@@ -495,7 +422,6 @@ impl DeletionProgressTracker {
 pub async fn purge_garbage(
    input_path: String,
    mode: PurgeMode,
-    min_age: Duration,
    dry_run: bool,
 ) -> anyhow::Result<()> {
    let list_bytes = tokio::fs::read(&input_path).await?;
@@ -506,7 +432,7 @@ pub async fn purge_garbage(
        input_path
    );

-    let (remote_client, _target) =
+    let remote_client =
        init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;

    assert_eq!(
@@ -533,7 +459,6 @@ pub async fn purge_garbage(
        .filter(|i| match (&mode, &i.reason) {
            (PurgeMode::DeletedAndMissing, _) => true,
            (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true,
-            (PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true,
            (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false,
        });

@@ -562,37 +487,6 @@ pub async fn purge_garbage(
    let mut progress_tracker = DeletionProgressTracker::default();
    while let Some(result) = get_objects_results.next().await {
        let mut object_list = result?;
-
-        // Extra safety check: even if a collection of objects is garbage, check max() of modification
-        // times before purging, so that if we incorrectly marked a live tenant as garbage then we would
-        // notice that its index has been written recently and would omit deleting it.
-        if object_list.is_empty() {
-            // Simplify subsequent code by ensuring list always has at least one item
-            // Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes
-            continue;
-        }
-        let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap();
-        let age = max_mtime.elapsed();
-        match age {
-            Err(_) => {
-                tracing::warn!("Bad last_modified time");
-                continue;
-            }
-            Ok(a) if a < min_age => {
-                // Failed age check.  This doesn't mean we did something wrong: a tenant might really be garbage and recently
-                // written, but out of an abundance of caution we still don't purge it.
-                tracing::info!(
-                    "Skipping tenant with young objects {}..{}",
-                    object_list.first().as_ref().unwrap().key,
-                    object_list.last().as_ref().unwrap().key
-                );
-                continue;
-            }
-            Ok(_) => {
-                // Passed age check
-            }
-        }
-
        objects_to_delete.append(&mut object_list);
        if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
            do_delete(
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -16,26 +16,22 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{anyhow, Context};
-use aws_config::retry::{RetryConfigBuilder, RetryMode};
 use aws_sdk_s3::config::Region;
 use aws_sdk_s3::error::DisplayErrorContext;
 use aws_sdk_s3::Client;

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::ValueEnum;
-use futures::{Stream, StreamExt};
 use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{
-    GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind,
-    S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
+    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
 };
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
-use storage_controller_client::control_api;
 use tokio::io::AsyncReadExt;
-use tokio_util::sync::CancellationToken;
 use tracing::error;
 use tracing_appender::non_blocking::WorkerGuard;
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
@@ -257,12 +253,6 @@ pub struct ControllerClientConfig {
    pub controller_jwt: String,
 }

-impl ControllerClientConfig {
-    pub fn build_client(self) -> control_api::Client {
-        control_api::Client::new(self.controller_api, Some(self.controller_jwt))
-    }
-}
-
 pub struct ConsoleConfig {
    pub token: String,
    pub base_url: Url,
@@ -315,15 +305,8 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
 }

 async fn init_s3_client(bucket_region: Region) -> Client {
-    let mut retry_config_builder = RetryConfigBuilder::new();
-
-    retry_config_builder
-        .set_max_attempts(Some(3))
-        .set_mode(Some(RetryMode::Adaptive));
-
    let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
        .region(bucket_region)
-        .retry_config(retry_config_builder.build())
        .load()
        .await;
    Client::new(&config)
@@ -336,35 +319,27 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
    }
 }

-fn make_root_target(
-    bucket_name: String,
-    prefix_in_bucket: String,
-    node_kind: NodeKind,
-) -> RootTarget {
-    let s3_target = S3Target {
-        bucket_name,
-        prefix_in_bucket,
-        delimiter: "/".to_string(),
-    };
-    match node_kind {
-        NodeKind::Pageserver => RootTarget::Pageserver(s3_target),
-        NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target),
-    }
-}
-
 async fn init_remote(
    bucket_config: BucketConfig,
    node_kind: NodeKind,
 ) -> anyhow::Result<(Arc<Client>, RootTarget)> {
    let bucket_region = Region::new(bucket_config.region);
+    let delimiter = "/".to_string();
    let s3_client = Arc::new(init_s3_client(bucket_region).await);
    let default_prefix = default_prefix_in_bucket(node_kind).to_string();

-    let s3_root = make_root_target(
-        bucket_config.bucket,
-        bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
-        node_kind,
-    );
+    let s3_root = match node_kind {
+        NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
+            bucket_name: bucket_config.bucket,
+            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
+            delimiter,
+        }),
+        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
+            bucket_name: bucket_config.bucket,
+            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
+            delimiter,
+        }),
+    };

    Ok((s3_client, s3_root))
 }
@@ -372,12 +347,12 @@ async fn init_remote(
 async fn init_remote_generic(
    bucket_config: BucketConfig,
    node_kind: NodeKind,
-) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
+) -> anyhow::Result<GenericRemoteStorage> {
    let endpoint = env::var("AWS_ENDPOINT_URL").ok();
    let default_prefix = default_prefix_in_bucket(node_kind).to_string();
    let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
    let storage = S3Config {
-        bucket_name: bucket_config.bucket.clone(),
+        bucket_name: bucket_config.bucket,
        bucket_region: bucket_config.region,
        prefix_in_bucket,
        endpoint,
@@ -391,13 +366,7 @@ async fn init_remote_generic(
        storage: RemoteStorageKind::AwsS3(storage),
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
-
-    // We already pass the prefix to the remote client above
-    let prefix_in_root_target = String::new();
-    let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
-
-    let client = GenericRemoteStorage::from_config(&storage_config).await?;
-    Ok((client, s3_root))
+    GenericRemoteStorage::from_config(&storage_config).await
 }

 async fn list_objects_with_retries(
@@ -435,84 +404,6 @@ async fn list_objects_with_retries(
    Err(anyhow!("unreachable unless MAX_RETRIES==0"))
 }

-/// Listing possibly large amounts of keys in a streaming fashion.
-fn stream_objects_with_retries<'a>(
-    storage_client: &'a GenericRemoteStorage,
-    listing_mode: ListingMode,
-    s3_target: &'a S3Target,
-) -> impl Stream<Item = Result<Listing, anyhow::Error>> + 'a {
-    async_stream::stream! {
-        let mut trial = 0;
-        let cancel = CancellationToken::new();
-        let prefix_str = &s3_target
-            .prefix_in_bucket
-            .strip_prefix("/")
-            .unwrap_or(&s3_target.prefix_in_bucket);
-        let prefix = RemotePath::from_string(prefix_str)?;
-        let mut list_stream =
-            storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
-        while let Some(res) = list_stream.next().await {
-            if let Err(err) = res {
-                let yield_err = if err.is_permanent() {
-                    true
-                } else {
-                    let backoff_time = 1 << trial.max(5);
-                    tokio::time::sleep(Duration::from_secs(backoff_time)).await;
-                    trial += 1;
-                    trial == MAX_RETRIES - 1
-                };
-                if yield_err {
-                    yield Err(err)
-                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
-                    break;
-                }
-            } else {
-                trial = 0;
-                yield res.map_err(anyhow::Error::from);
-            }
-        }
-    }
-}
-
-/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
-/// use [`stream_objects_with_retries`] instead.
-async fn list_objects_with_retries_generic(
-    remote_client: &GenericRemoteStorage,
-    listing_mode: ListingMode,
-    s3_target: &S3Target,
-) -> anyhow::Result<Listing> {
-    let cancel = CancellationToken::new();
-    let prefix_str = &s3_target
-        .prefix_in_bucket
-        .strip_prefix("/")
-        .unwrap_or(&s3_target.prefix_in_bucket);
-    let prefix = RemotePath::from_string(prefix_str)?;
-    for trial in 0..MAX_RETRIES {
-        match remote_client
-            .list(Some(&prefix), listing_mode, None, &cancel)
-            .await
-        {
-            Ok(response) => return Ok(response),
-            Err(e) => {
-                if trial == MAX_RETRIES - 1 {
-                    return Err(e)
-                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
-                }
-                error!(
-                    "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
-                    s3_target.bucket_name,
-                    s3_target.prefix_in_bucket,
-                    s3_target.delimiter,
-                    DisplayErrorContext(e),
-                );
-                let backoff_time = 1 << trial.max(5);
-                tokio::time::sleep(Duration::from_secs(backoff_time)).await;
-            }
-        }
-    }
-    panic!("MAX_RETRIES is not allowed to be 0");
-}
-
 async fn download_object_with_retries(
    s3_client: &Client,
    bucket_name: &str,
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Joonas Koivunen	dcfd92b6db	test: funroll-loop first iteration in test_retried_detach_ancestor_after_failed_reparenting	2024-07-26 14:39:32 +00:00
Joonas Koivunen	f58636ffdd	test: refactor -- begin to -funroll-loops in test_retried_detach_ancestor_after_failed_reparenting	2024-07-26 14:39:32 +00:00
Joonas Koivunen	f3ac5bcbe1	test: ensure gc is unpaused with the earlier deletion test	2024-07-26 14:39:32 +00:00
Joonas Koivunen	eb3711b881	doc: why no cancel	2024-07-26 14:39:32 +00:00
Joonas Koivunen	c864166b32	test: make sure gc gets unblocked by late deletion	2024-07-26 14:39:32 +00:00
Joonas Koivunen	ce9b5ae7bf	test: allow the 500 error crutch temporarily	2024-07-26 14:39:32 +00:00
Joonas Koivunen	cd2cbe0691	test: rename test_deletion_after_timeline_ancestor_detach_before_completion	2024-07-26 14:39:32 +00:00
Joonas Koivunen	7f241bd379	refactor: remove needless Error::from	2024-07-26 14:39:32 +00:00
Joonas Koivunen	ff52901028	refactor: still_ongoing assert is shared in all paths	2024-07-26 14:39:32 +00:00
Joonas Koivunen	bb377a3544	fixup: make sure detach_ancestor is blocking gc	2024-07-26 14:39:32 +00:00
Joonas Koivunen	5ece7af497	doc: remove confusing comment	2024-07-26 14:39:32 +00:00
Joonas Koivunen	2be3027fa5	doc: elaborate on weird query	2024-07-26 14:39:32 +00:00
Joonas Koivunen	14a0517c7f	also assert still ongoing	2024-07-26 14:39:32 +00:00
Joonas Koivunen	dcff25c293	chore: adjust assert message	2024-07-26 14:39:32 +00:00
Joonas Koivunen	f80c37b733	chore: forgotten to update panic text with detach_and_reparent renaming	2024-07-26 14:39:32 +00:00
Joonas Koivunen	b9d0b26cea	doc: remove possibly wrong comment	2024-07-26 14:39:32 +00:00
Joonas Koivunen	c2c28f211b	doc: explain returning option	2024-07-26 14:39:32 +00:00
Joonas Koivunen	1ebcb1c45b	doc: clean out FIXME we cannot protect against willful misuse. I had been thinking of witness of Attempt but ...	2024-07-26 14:39:32 +00:00
Joonas Koivunen	66d750ec20	info log on detach	2024-07-26 14:39:32 +00:00
Joonas Koivunen	ba3a6645e7	fix: info log line again, botched rebase?	2024-07-26 14:39:32 +00:00
Joonas Koivunen	8885a8c482	fixup: missed hashset change	2024-07-26 14:39:32 +00:00
Joonas Koivunen	c8880b69fb	stop with the (ancestor_lsn, timeline_id) ordered reparented I was thinking of the case where we have multiple reparented at the same ancestor_lsn. But of course, that is not a problem if we compare the reparented as a set...	2024-07-26 14:39:32 +00:00
Joonas Koivunen	274b2a611b	test: handle the case where timeline cannot be found at least do not double-panick.	2024-07-26 14:39:32 +00:00
Joonas Koivunen	a7153bf9b2	test: forgotten allowed errors	2024-07-26 14:39:32 +00:00
Joonas Koivunen	8a4236a441	test: remove needless s3 storage	2024-07-26 14:39:32 +00:00
Joonas Koivunen	7ec927e43b	test: cleanup todos	2024-07-26 14:39:32 +00:00
Joonas Koivunen	22470ef444	test: comment	2024-07-26 14:39:32 +00:00
Joonas Koivunen	8248cbb45b	test: ensure persisted gc blocking works across restart	2024-07-26 14:39:32 +00:00
Joonas Koivunen	4dd805b68a	test: remove the extra deletion which was confusing it had already been reparented, so it was not needed.	2024-07-26 14:39:32 +00:00
Joonas Koivunen	f582675452	test: refactor repetition	2024-07-26 14:39:32 +00:00
Joonas Koivunen	48069f68bb	chore: forgotten pyfmt	2024-07-26 14:39:32 +00:00
Joonas Koivunen	8f52139913	additional assert in completion	2024-07-26 14:39:32 +00:00
Joonas Koivunen	fc4d80bbf2	elaborate on TODO for which a test is later added	2024-07-26 14:39:32 +00:00
Joonas Koivunen	dc83a5a978	fixup dae8c75c04 test: cannot be parametrized over return or exit	2024-07-26 14:39:32 +00:00
Joonas Koivunen	f4fb08d869	stop masking the topmost error in http handler	2024-07-26 14:39:31 +00:00
Joonas Koivunen	75b326faf4	test: complete fixmes	2024-07-26 14:39:31 +00:00
Joonas Koivunen	c23cd5c149	ongoing_detach_ancestor => gc_blocking in index_part	2024-07-26 14:39:31 +00:00
Joonas Koivunen	f4cd9fe40b	refactor: misc after attempt to add lock_in_reparentable	2024-07-26 14:39:31 +00:00
Joonas Koivunen	43af9484c0	doc: schedule_reparenting_and_wait	2024-07-26 14:39:31 +00:00
Joonas Koivunen	842bd4c2db	refactor: reparentable_timelines query out	2024-07-26 14:39:31 +00:00
Joonas Koivunen	ada9a46dca	remove done fixme, minor reformattting	2024-07-26 14:39:31 +00:00
Joonas Koivunen	742fcac7b9	refactor: use partialeq more	2024-07-26 14:39:31 +00:00
Joonas Koivunen	55aeeb5765	allow deleting timeline unblock gc	2024-07-26 14:39:31 +00:00
Joonas Koivunen	89426570d3	relax overly strict comparisons	2024-07-26 14:39:31 +00:00
Joonas Koivunen	7f767ca18e	fix: must_restart condition	2024-07-26 14:39:31 +00:00
Joonas Koivunen	1348dbf0f1	doc: comment cleanup	2024-07-26 14:39:31 +00:00
Joonas Koivunen	a179283f86	always notify gc_waiting when writing over the witness tracking	2024-07-26 14:39:31 +00:00
Joonas Koivunen	deb86c1ea1	remodel the return type	2024-07-26 14:39:31 +00:00
Joonas Koivunen	dfdf40916f	rename complete_detaching_from_ancestor it hasn't meant completing in a while now :)	2024-07-26 14:39:31 +00:00
Joonas Koivunen	c6d8015fe9	chore: clippy needless into_iter	2024-07-26 14:39:31 +00:00
Joonas Koivunen	b2233d557b	test: complicate to include added paths	2024-07-26 14:39:31 +00:00
Joonas Koivunen	ce2552ba67	minor comment update for FIXME about 503	2024-07-26 14:39:31 +00:00
Joonas Koivunen	f4d773bb89	refactor: unify t::s::Semaphore	2024-07-26 14:39:31 +00:00
Joonas Koivunen	6f28263428	refactor: failpoint all but one	2024-07-26 14:39:31 +00:00
Joonas Koivunen	1e380ea5af	refactor: Ancestor::Delete is not needed	2024-07-26 14:39:31 +00:00
Joonas Koivunen	8258385301	remove indentation level with exhaustive match	2024-07-26 14:39:31 +00:00
Joonas Koivunen	6a8f00dea0	fix: return reparented_direct_children in case we reparent nothing new	2024-07-26 14:39:31 +00:00
Joonas Koivunen	44cdb9fb58	refactor: reparented_direct_children query	2024-07-26 14:39:31 +00:00
Joonas Koivunen	cdfaf0700f	fix: bifurcate the detach+reparent step	2024-07-26 14:39:31 +00:00
Joonas Koivunen	881e1ad056	refactor: no need to collect reparentable here	2024-07-26 14:39:31 +00:00
Joonas Koivunen	bb3d70e24d	fix: properly cancel if any reparenting failed	2024-07-26 14:39:31 +00:00
Joonas Koivunen	c6c560e4c8	rewrite to include testing assertion	2024-07-26 14:39:31 +00:00
Joonas Koivunen	8dd332aed5	doc: remove unnecessary comment	2024-07-26 14:39:31 +00:00
Joonas Koivunen	5c03a17eb8	wip: some progress now we hit the todo! in "already detached" path.	2024-07-26 14:39:31 +00:00
Joonas Koivunen	402d66778e	make reparenting operations idempotent	2024-07-26 14:39:31 +00:00
Joonas Koivunen	39e2bc932f	prepare to reparent while gc blocked	2024-07-26 14:39:31 +00:00
Joonas Koivunen	5fc034fa7f	feat: block gc persistently until detach ancestor completes	2024-07-26 14:39:31 +00:00
Joonas Koivunen	f9b12def0b	add support for WaitToActivate errors	2024-07-26 14:39:31 +00:00
Joonas Koivunen	5d0071447c	partial: index_part.json support for ongoing_detach_ancestor	2024-07-26 14:39:31 +00:00
Joonas Koivunen	d9eba3f8c3	==== PR cut here?	2024-07-26 14:39:31 +00:00
Joonas Koivunen	409e2eff9e	fix: run upload_rewritten_layer in a span there was a weird failure observed with CI tests: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8430/10108870590/index.html#suites/a1c2be32556270764423c495fad75d47/94a4686382b96297	2024-07-26 14:39:31 +00:00
Joonas Koivunen	e6e3b9a716	doc: remove on_gc_task_start fixme	2024-07-26 08:52:55 +00:00
Joonas Koivunen	7f31a3f671	forgotten rename, maybe	2024-07-26 08:52:55 +00:00
Joonas Koivunen	9971ae3d24	rename is_detached_from_{original_,}ancestor (just the rename)	2024-07-26 08:52:55 +00:00
Joonas Koivunen	48a2a20de3	chore: derive default	2024-07-26 08:52:55 +00:00
Joonas Koivunen	29ef8f15ce	chore: unused variable	2024-07-26 08:52:55 +00:00
Joonas Koivunen	5e45dd3f86	rename SharedState::notify to continue_existing_attempt	2024-07-26 08:52:55 +00:00
Joonas Koivunen	5fced442d7	warning caused by removed body	2024-07-26 08:52:55 +00:00
Joonas Koivunen	4222610233	cleanup index part dependent	2024-07-26 08:52:55 +00:00
Joonas Koivunen	92deb0dfd7	plumbing: collect timelines index parts	2024-07-26 08:52:55 +00:00
Joonas Koivunen	46ca6f17c5	plumbing: notify shared state of existing attempt	2024-07-26 08:52:55 +00:00
Joonas Koivunen	14869abb77	complete the plumbing with non-notifying attempt_blocks_gc impl	2024-07-26 08:52:55 +00:00
Joonas Koivunen	5330fd9366	doc(fixme): shared state	2024-07-26 08:52:55 +00:00
Joonas Koivunen	6c5b3b7812	doc: more sketched api comments	2024-07-26 08:52:55 +00:00
Joonas Koivunen	849fe0f191	plumb the shared state through the api for the gc pausing is quite awkward.	2024-07-26 08:52:55 +00:00
Joonas Koivunen	f564b66f21	shared state sketch	2024-07-26 08:52:55 +00:00
Joonas Koivunen	2e58ccee78	temp: planning	2024-07-26 08:52:55 +00:00
Joonas Koivunen	f398ab0264	completion: Debug and query for barrier connection	2024-07-26 08:52:55 +00:00
Joonas Koivunen	f23ee2ccdb	doc(test): be more accurate	2024-07-26 08:52:55 +00:00
Joonas Koivunen	0ad31bb7fb	doc: remove obsolete FIXME this was cleared with partial metadata updates.	2024-07-26 08:52:55 +00:00
Joonas Koivunen	86f26d0918	chore: minor rename FIXME in IndexPart	2024-07-26 08:52:55 +00:00
Joonas Koivunen	4a562dff2e	doc: more	2024-07-26 08:52:55 +00:00
Joonas Koivunen	f9185b42a9	doc: minor enhancements	2024-07-26 08:52:55 +00:00
Joonas Koivunen	d4f30daa81	chore: minor indentation problem	2024-07-26 08:52:55 +00:00
Joonas Koivunen	97ab53e826	chore: add std::fmt::Debug for Barrier	2024-07-26 08:52:55 +00:00