test: funroll-loop first iteration in test_retried_detach_ancestor_after_failed_reparenting

test: refactor -- begin to -funroll-loops in test_retried_detach_ancestor_after_failed_reparenting
test: ensure gc is unpaused with the earlier deletion test
2026-05-30 19:40:39 +00:00 · 2024-07-26 14:39:32 +00:00 · 2024-07-26 14:39:32 +00:00 · 2024-07-26 14:39:32 +00:00 · 2024-07-26 14:39:32 +00:00 · 2024-07-26 14:39:32 +00:00
191 changed files with 4614 additions and 9826 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -8,8 +8,6 @@ self-hosted-runner:
    - small-arm64
    - us-east-2
 config-variables:
-  - BENCHMARK_PROJECT_ID_PUB
-  - BENCHMARK_PROJECT_ID_SUB
  - REMOTE_STORAGE_AZURE_CONTAINER
  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -14,8 +14,11 @@ inputs:
  api_host:
    description: 'Neon API host'
    default: console-stage.neon.build
+  provisioner:
+    description: 'k8s-pod or k8s-neonvm'
+    default: 'k8s-pod'
  compute_units:
-    description: '[Min, Max] compute units'
+    description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
    default: '[1, 1]'

 outputs:
@@ -34,6 +37,12 @@ runs:
      # A shell without `set -x` to not to expose password/dsn in logs
      shell: bash -euo pipefail {0}
      run: |
+        # Min/Max CU ranges only apply to k8s-neonvm (autoscaling); for k8s-pod they must be equal, so fail before calling the API
+        if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
+          echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
+          exit 1
+        fi
+
        project=$(curl \
          "https://${API_HOST}/api/v2/projects" \
          --fail \
@@ -45,7 +54,7 @@ runs:
              \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
              \"pg_version\": ${POSTGRES_VERSION},
              \"region_id\": \"${REGION_ID}\",
-              \"provisioner\": \"k8s-neonvm\",
+              \"provisioner\": \"${PROVISIONER}\",
              \"autoscaling_limit_min_cu\": ${MIN_CU},
              \"autoscaling_limit_max_cu\": ${MAX_CU},
              \"settings\": { }
@@ -68,5 +77,6 @@ runs:
        API_KEY: ${{ inputs.api_key }}
        REGION_ID: ${{ inputs.region_id }}
        POSTGRES_VERSION: ${{ inputs.postgres_version }}
+        PROVISIONER: ${{ inputs.provisioner }}
        MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
        MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -131,8 +131,8 @@ runs:
          exit 1
        fi
        if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
-          # -n sets the number of parallel processes that pytest-xdist will run
-          EXTRA_PARAMS="-n12 $EXTRA_PARAMS"
+          # -n16 uses sixteen processes to run tests via pytest-xdist
+          EXTRA_PARAMS="-n16 $EXTRA_PARAMS"

          # --dist=loadgroup points tests marked with @pytest.mark.xdist_group
          # to the same worker to make @pytest.mark.order work with xdist
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -19,10 +19,6 @@ on:
        description: 'debug or release'
        required: true
        type: string
-      pg-versions:
-        description: 'a json array of postgres versions to run regression tests on'
-        required: true
-        type: string

 defaults:
  run:
@@ -258,7 +254,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        pg_version: ${{ fromJson(inputs.pg-versions) }}
+        pg_version: [ v14, v15, v16 ]
    steps:
      - uses: actions/checkout@v4
        with:
@@ -282,11 +278,14 @@ jobs:
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
          BUILD_TAG: ${{ inputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_IMPL: vectored
+          PAGESERVER_GET_IMPL: vectored
+          PAGESERVER_VALIDATE_VEC_GET: true

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
      - name: Merge and upload coverage data
        if: |
          false &&
-          inputs.build-type == 'debug' && matrix.pg_version == 'v16'
+          inputs.build-type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -56,23 +56,18 @@ concurrency:
 jobs:
  bench:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
-    permissions:
-      contents: read
-      id-token: write # Required for OIDC authentication.
    strategy:
      fail-fast: false
      matrix:
        include:
-          # - DEFAULT_PG_VERSION: 16
-          #   PLATFORM: "neon-staging"
-          #   region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
-          #   RUNNER: [ self-hosted, us-east-2, x64 ]
-          #   IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+          - DEFAULT_PG_VERSION: 16
+            PLATFORM: "neon-staging"
+            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+            provisioner: 'k8s-pod'
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "azure-staging"
            region_id: 'azure-eastus2'
-            RUNNER: [ self-hosted, eastus2, azure ]
-            IMAGE: neondatabase/build-tools:pinned
+            provisioner: 'k8s-neonvm'
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -83,20 +78,14 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.PLATFORM }}

-    runs-on: ${{ matrix.RUNNER }}
+    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: ${{ matrix.IMAGE }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
    - uses: actions/checkout@v4

-    - name: Configure AWS credentials
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: us-east-2
-        role-to-assume: ${{ secrets.DEV_AWS_OIDC_ROLE_ARN }}
-    - run: aws sts get-caller-identity
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
@@ -111,6 +100,7 @@ jobs:
        region_id: ${{ matrix.region_id }}
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+        provisioner: ${{ matrix.provisioner }}

    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
@@ -160,7 +150,7 @@ jobs:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -181,7 +171,7 @@ jobs:
        path: /tmp/neon/
        prefix: latest

-    - name: Run Logical Replication benchmarks
+    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
@@ -189,15 +179,12 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 5400
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
-        BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
-        BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}

-    - name: Run Physical Replication benchmarks
+    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
@@ -229,11 +216,11 @@ jobs:
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
    #
    # Available platforms:
-    # - neonvm-captest-new: Freshly created project (1 CU)
-    # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
+    # - neon-captest-new: Freshly created project (1 CU)
+    # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
    # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
    # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
-    # - neonvm-captest-reuse: Reusing existing project
+    # - neon-captest-reuse: Reusing existing project
    # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
    env:
@@ -250,8 +237,6 @@ jobs:
      id: pgbench-compare-matrix
      run: |
        region_id_default=${{ env.DEFAULT_REGION_ID }}
-        runner_default="[ self-hosted, us-east-2, x64 ]"
-        image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
        matrix='{
          "pg_version" : [
            16
@@ -260,24 +245,23 @@ jobs:
            "'"$region_id_default"'"
            ],
          "platform": [
-            "neonvm-captest-new",
-            "neonvm-captest-reuse",
+            "neon-captest-new",
+            "neon-captest-reuse",
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "runner": [ "'"$runner_default"'" ],
-          "image": [ "'"$image_default"'" ],
-          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb" ,"runner": "'"$runner_default"'", "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "10gb","runner": "'"$runner_default"'", "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb","runner": "'"$runner_default"'", "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": "[ self-hosted, eastus2, azure ]", "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb","runner": "[ self-hosted, eastus2, azure ]", "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": "[ self-hosted, eastus2, azure ]", "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb",,"runner": "'"$runner_default"'", "image": "'"$image_default"'" }]
+          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier",         "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new",              "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": "'"$runner_default"'", "image": "'"$image_default"'" }]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -287,7 +271,7 @@ jobs:
      run: |
        matrix='{
          "platform": [
-            "neonvm-captest-reuse"
+            "neon-captest-reuse"
          ]
        }'

@@ -303,7 +287,7 @@ jobs:
      run: |
        matrix='{
          "platform": [
-            "neonvm-captest-reuse"
+            "neon-captest-reuse"
          ],
          "scale": [
            "10"
@@ -335,9 +319,9 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.platform }}

-    runs-on: ${{ matrix.runner }}
+    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: ${{ matrix.image }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    # Increase timeout to 8h, default timeout is 6h
@@ -354,7 +338,7 @@ jobs:
        prefix: latest

    - name: Create Neon Project
-      if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
+      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
@@ -362,18 +346,19 @@ jobs:
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
        compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
+        provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

    - name: Set up Connection String
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
          neonvm-captest-sharding-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
            ;;
-          neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
          rds-aurora)
@@ -457,13 +442,9 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - PLATFORM: "neonvm-captest-pgvector"
-            RUNNER: [ self-hosted, us-east-2, x64 ]
-            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+          - PLATFORM: "neon-captest-pgvector"
          - PLATFORM: "azure-captest-pgvector"
-            RUNNER: [ self-hosted, eastus2, azure ]
-            IMAGE: neondatabase/build-tools:pinned

    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
      TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -475,9 +456,9 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.PLATFORM }}

-    runs-on: ${{ matrix.RUNNER }}
+    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: ${{ matrix.IMAGE }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
@@ -505,7 +486,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-pgvector)
+          neon-captest-pgvector)
            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
            ;;
          azure-captest-pgvector)
@@ -604,7 +585,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
            ;;
          rds-aurora)
@@ -614,7 +595,7 @@ jobs:
            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -691,7 +672,7 @@ jobs:
    - name: Get Connstring Secret Name
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            ENV_PLATFORM=CAPTEST_TPCH
            ;;
          rds-aurora)
@@ -701,7 +682,7 @@ jobs:
            ENV_PLATFORM=RDS_AURORA_TPCH
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -778,7 +759,7 @@ jobs:
      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
-          neonvm-captest-reuse)
+          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
            ;;
          rds-aurora)
@@ -788,7 +769,7 @@ jobs:
            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -203,8 +203,7 @@ jobs:
      fail-fast: false
      matrix:
        arch: [ x64 ]
-        # Do not build or run tests in debug for release branches
-        build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
+        build-type: [ debug, release ]
        include:
          - build-type: release
            arch: arm64
@@ -214,8 +213,6 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
-      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
    secrets: inherit

  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -289,6 +286,9 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_IMPL: vectored
+          PAGESERVER_GET_IMPL: vectored
+          PAGESERVER_VALIDATE_VEC_GET: false
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -309,7 +309,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  create-test-report:
-    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
+    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -836,9 +836,6 @@ jobs:
          rm -rf .docker-custom

  promote-images:
-    permissions:
-      contents: read  # This is required for actions/checkout
-      id-token: write # This is required for Azure Login to work.
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04

@@ -865,28 +862,6 @@ jobs:
                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
          done

-      - name: Azure login
-        if: github.ref_name == 'main'
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
-      - name: Login to ACR
-        if: github.ref_name == 'main'
-        run: |
-          az acr login --name=neoneastus2
-
-      - name: Copy docker images to ACR-dev
-        if: github.ref_name == 'main'
-        run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
-            docker buildx imagetools create \
-              -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
-                                        neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
-          done
-
      - name: Add latest tag to images
        if: github.ref_name == 'main'
        run: |
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -13,7 +13,6 @@ on:
    paths:
      - '.github/workflows/pg-clients.yml'
      - 'test_runner/pg_clients/**'
-      - 'test_runner/logical_repl/**'
      - 'poetry.lock'
  workflow_dispatch:

@@ -50,101 +49,6 @@ jobs:
      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
    secrets: inherit

-  test-logical-replication:
-    needs: [ build-build-tools-image ]
-    runs-on: ubuntu-22.04
-
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init --user root
-    services:
-      clickhouse:
-        image: clickhouse/clickhouse-server:24.6.3.64
-        ports:
-          - 9000:9000
-          - 8123:8123
-      zookeeper:
-        image: quay.io/debezium/zookeeper:2.7
-        ports:
-          - 2181:2181
-      kafka:
-        image: quay.io/debezium/kafka:2.7
-        env:
-          ZOOKEEPER_CONNECT: "zookeeper:2181"
-          KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
-          KAFKA_BROKER_ID: 1
-          KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
-          KAFKA_JMX_PORT: 9991
-        ports:
-          - 9092:9092
-      debezium:
-        image: quay.io/debezium/connect:2.7
-        env:
-          BOOTSTRAP_SERVERS: kafka:9092
-          GROUP_ID: 1
-          CONFIG_STORAGE_TOPIC: debezium-config
-          OFFSET_STORAGE_TOPIC: debezium-offset
-          STATUS_STORAGE_TOPIC: debezium-status
-          DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
-        ports:
-          - 8083:8083
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Download Neon artifact
-        uses: ./.github/actions/download
-        with:
-          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-          path: /tmp/neon/
-          prefix: latest
-
-      - name: Create Neon Project
-        id: create-neon-project
-        uses: ./.github/actions/neon-project-create
-        with:
-          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-          postgres_version: ${{ env.DEFAULT_PG_VERSION }}
-
-      - name: Run tests
-        uses: ./.github/actions/run-python-test-set
-        with:
-          build_type: remote
-          test_selection: logical_repl
-          run_in_parallel: false
-          extra_params: -m remote_cluster
-          pg_version: ${{ env.DEFAULT_PG_VERSION }}
-        env:
-          BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
-
-      - name: Delete Neon Project
-        if: always()
-        uses: ./.github/actions/neon-project-delete
-        with:
-          project_id: ${{ steps.create-neon-project.outputs.project_id }}
-          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-
-      - name: Create Allure report
-        if: ${{ !cancelled() }}
-        id: create-allure-report
-        uses: ./.github/actions/allure-report-generate
-        with:
-          store-test-results-into-db: true
-        env:
-          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-
-      - name: Post to a Slack channel
-        if: github.event.schedule && failure()
-        uses: slackapi/slack-github-action@v1
-        with:
-          channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
-          slack-message: |
-            Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
  test-postgres-client-libs:
    needs: [ build-build-tools-image ]
    runs-on: ubuntu-22.04
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -66,22 +66,8 @@ jobs:
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

-      - name: Azure login
-        if: steps.check-manifests.outputs.skip == 'false'
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
-      - name: Login to ACR
-        if: steps.check-manifests.outputs.skip == 'false'
-        run: |
-          az acr login --name=neoneastus2
-
-      - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
        if: steps.check-manifests.outputs.skip == 'false'
        run: |
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
-                                          -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
                                             neondatabase/build-tools:${FROM_TAG}
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -13,6 +13,8 @@ defaults:
 env:
  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

 jobs:
  cancel-previous-e2e-tests:
@@ -62,35 +64,21 @@ jobs:
    needs: [ tag ]
    runs-on: ubuntu-22.04
    env:
-      EVENT_ACTION: ${{ github.event.action }}
-      GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      TAG: ${{ needs.tag.outputs.build-tag }}
    steps:
-      - name: Wait for `promote-images` job to finish
-        # It's important to have a timeout here, the script in the step can run infinitely
-        timeout-minutes: 60
+      - name: check if ecr image are present
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        run: |
-          if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then
-            exit 0
-          fi
-
-          # For PRs we use the run id as the tag
-          BUILD_AND_TEST_RUN_ID=${TAG}
-          while true; do
-            conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
-            case "$conclusion" in
-              success)
-                break
-                ;;
-              failure | cancelled | skipped)
-                echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
-                exit 1
-                ;;
-              *)
-                echo "The 'promote-images' hasn't succeed yet. Waiting..."
-                sleep 60
-                ;;
-            esac
+          # Fail early if any of the listed ECR repositories has no image whose tag contains ${TAG}
+          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
+            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
+            if [ "$OUTPUT" == "" ]; then
+              # Report on stderr: $GITHUB_OUTPUT only accepts name=value records, so the message would never reach the job log
+              echo >&2 "$REPO with image tag $TAG not found"
+              exit 1
+            fi
          done

      - name: Set e2e-platforms
--- a/8
+++ b/8
@@ -1,13 +1,13 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /storage_controller @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
+/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
 /libs/remote_storage/ @neondatabase/storage
-/libs/safekeeper_api/ @neondatabase/storage
+/libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling
 /pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
-/pgxn/neon/ @neondatabase/compute @neondatabase/storage
+/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
 /proxy/ @neondatabase/proxy
-/safekeeper/ @neondatabase/storage
+/safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1418,7 +1418,7 @@ dependencies = [
 "clap",
 "criterion-plot",
 "is-terminal",
- "itertools 0.10.5",
+ "itertools",
 "num-traits",
 "once_cell",
 "oorandom",
@@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
 dependencies = [
 "cast",
- "itertools 0.10.5",
+ "itertools",
 ]

 [[package]]
@@ -1672,7 +1672,6 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
 dependencies = [
 "bitflags 2.4.1",
 "byteorder",
- "chrono",
 "diesel_derives",
 "itoa",
 "pq-sys",
@@ -2134,12 +2133,6 @@ dependencies = [
 "slab",
 ]

-[[package]]
-name = "gen_ops"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
-
 [[package]]
 name = "generic-array"
 version = "0.14.7"
@@ -2716,6 +2709,17 @@ version = "3.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"

+[[package]]
+name = "io-lifetimes"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "io-uring"
 version = "0.6.2"
@@ -2734,13 +2738,14 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"

 [[package]]
 name = "is-terminal"
-version = "0.4.12"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
+checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
 "hermit-abi",
- "libc",
- "windows-sys 0.52.0",
+ "io-lifetimes",
+ "rustix 0.37.25",
+ "windows-sys 0.48.0",
 ]

 [[package]]
@@ -2752,15 +2757,6 @@ dependencies = [
 "either",
 ]

-[[package]]
-name = "itertools"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
-dependencies = [
- "either",
-]
-
 [[package]]
 name = "itoa"
 version = "1.0.6"
@@ -2875,6 +2871,18 @@ version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"

+[[package]]
+name = "linux-raw-sys"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.13"
@@ -2992,7 +3000,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
 dependencies = [
 "libc",
 "measured",
- "procfs",
+ "procfs 0.16.0",
 ]

 [[package]]
@@ -3037,7 +3045,7 @@ dependencies = [
 "measured",
 "measured-process",
 "once_cell",
- "procfs",
+ "procfs 0.14.2",
 "prometheus",
 "rand 0.8.5",
 "rand_distr",
@@ -3566,7 +3574,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "leaky-bucket",
 "md5",
 "metrics",
@@ -3584,9 +3592,8 @@ dependencies = [
 "postgres_connection",
 "postgres_ffi",
 "pq_proto",
- "procfs",
+ "procfs 0.14.2",
 "rand 0.8.5",
- "range-set-blaze",
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
@@ -3637,7 +3644,7 @@ dependencies = [
 "hex",
 "humantime",
 "humantime-serde",
- "itertools 0.10.5",
+ "itertools",
 "postgres_ffi",
 "rand 0.8.5",
 "serde",
@@ -3695,7 +3702,7 @@ dependencies = [
 "hex-literal",
 "humantime",
 "humantime-serde",
- "itertools 0.10.5",
+ "itertools",
 "metrics",
 "once_cell",
 "pageserver_api",
@@ -4027,7 +4034,7 @@ name = "postgres_connection"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "itertools 0.10.5",
+ "itertools",
 "once_cell",
 "postgres",
 "tokio-postgres",
@@ -4085,7 +4092,7 @@ version = "0.1.0"
 dependencies = [
 "byteorder",
 "bytes",
- "itertools 0.10.5",
+ "itertools",
 "pin-project-lite",
 "postgres-protocol",
 "rand 0.8.5",
@@ -4131,6 +4138,21 @@ dependencies = [
 "unicode-ident",
 ]

+[[package]]
+name = "procfs"
+version = "0.14.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
+dependencies = [
+ "bitflags 1.3.2",
+ "byteorder",
+ "chrono",
+ "flate2",
+ "hex",
+ "lazy_static",
+ "rustix 0.36.16",
+]
+
 [[package]]
 name = "procfs"
 version = "0.16.0"
@@ -4138,12 +4160,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
 dependencies = [
 "bitflags 2.4.1",
- "chrono",
- "flate2",
 "hex",
 "lazy_static",
 "procfs-core",
- "rustix",
+ "rustix 0.38.28",
 ]

 [[package]]
@@ -4153,15 +4173,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
 dependencies = [
 "bitflags 2.4.1",
- "chrono",
 "hex",
 ]

 [[package]]
 name = "prometheus"
-version = "0.13.4"
+version = "0.13.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
+checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
 dependencies = [
 "cfg-if",
 "fnv",
@@ -4169,7 +4188,7 @@ dependencies = [
 "libc",
 "memchr",
 "parking_lot 0.12.1",
- "procfs",
+ "procfs 0.14.2",
 "thiserror",
 ]

@@ -4191,7 +4210,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
 dependencies = [
 "bytes",
 "heck 0.4.1",
- "itertools 0.10.5",
+ "itertools",
 "lazy_static",
 "log",
 "multimap",
@@ -4212,7 +4231,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
 dependencies = [
 "anyhow",
- "itertools 0.10.5",
+ "itertools",
 "proc-macro2",
 "quote",
 "syn 1.0.109",
@@ -4269,7 +4288,7 @@ dependencies = [
 "hyper-util",
 "indexmap 2.0.1",
 "ipnet",
- "itertools 0.10.5",
+ "itertools",
 "lasso",
 "md5",
 "measured",
@@ -4324,7 +4343,6 @@ dependencies = [
 "tracing-opentelemetry",
 "tracing-subscriber",
 "tracing-utils",
- "try-lock",
 "typed-json",
 "url",
 "urlencoding",
@@ -4446,18 +4464,6 @@ dependencies = [
 "rand_core 0.5.1",
 ]

-[[package]]
-name = "range-set-blaze"
-version = "0.1.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
-dependencies = [
- "gen_ops",
- "itertools 0.12.1",
- "num-integer",
- "num-traits",
-]
-
 [[package]]
 name = "rayon"
 version = "1.7.0"
@@ -4626,7 +4632,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "metrics",
 "once_cell",
 "pin-project-lite",
@@ -4936,6 +4942,34 @@ dependencies = [
 "nom",
 ]

+[[package]]
+name = "rustix"
+version = "0.36.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.1.4",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "rustix"
+version = "0.37.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.3.8",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "rustix"
 version = "0.38.28"
@@ -5684,7 +5718,6 @@ dependencies = [
 "aws-config",
 "bytes",
 "camino",
- "chrono",
 "clap",
 "control_plane",
 "diesel",
@@ -5695,7 +5728,7 @@ dependencies = [
 "hex",
 "humantime",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "lasso",
 "measured",
 "metrics",
@@ -5704,7 +5737,6 @@ dependencies = [
 "pageserver_client",
 "postgres_connection",
 "r2d2",
- "rand 0.8.5",
 "reqwest 0.12.4",
 "routerify",
 "scopeguard",
@@ -5760,10 +5792,9 @@ dependencies = [
 "either",
 "futures",
 "futures-util",
- "git-version",
 "hex",
 "humantime",
- "itertools 0.10.5",
+ "itertools",
 "once_cell",
 "pageserver",
 "pageserver_api",
@@ -5940,15 +5971,15 @@ dependencies = [

 [[package]]
 name = "tempfile"
-version = "3.9.0"
+version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
+checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
 dependencies = [
 "cfg-if",
- "fastrand 2.0.0",
- "redox_syscall 0.4.1",
- "rustix",
- "windows-sys 0.52.0",
+ "fastrand 1.9.0",
+ "redox_syscall 0.3.5",
+ "rustix 0.37.25",
+ "windows-sys 0.45.0",
 ]

 [[package]]
@@ -6564,9 +6595,9 @@ dependencies = [

 [[package]]
 name = "try-lock"
-version = "0.2.5"
+version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"

 [[package]]
 name = "tungstenite"
@@ -7145,6 +7176,15 @@ dependencies = [
 "windows_x86_64_msvc 0.42.2",
 ]

+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets 0.42.2",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.48.0"
@@ -7163,6 +7203,21 @@ dependencies = [
 "windows-targets 0.52.4",
 ]

+[[package]]
+name = "windows-targets"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
 [[package]]
 name = "windows-targets"
 version = "0.48.0"
@@ -7392,7 +7447,7 @@ dependencies = [
 "hmac",
 "hyper 0.14.26",
 "indexmap 1.9.3",
- "itertools 0.10.5",
+ "itertools",
 "libc",
 "log",
 "memchr",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
 parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
-procfs = "0.16"
+procfs = "0.14"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
@@ -184,7 +184,6 @@ tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.21.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
-try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }
 typed-json = "0.1"
 url = "2.2"
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.80.0
+ENV RUSTC_VERSION=1.79.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -933,8 +933,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
 #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
 COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
 COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
-COPY --from=rum-pg-build /rum.tar.gz /ext-src
-COPY patches/rum.patch /ext-src
+#COPY --from=rum-pg-build /rum.tar.gz /ext-src
 #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
 COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
 COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -946,7 +945,7 @@ COPY patches/pg_hintplan.patch /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
-#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
+COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
 COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
 COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
 COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
@@ -961,7 +960,6 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
    rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
    || exit 1; rm -f $f; done
 RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
-RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
 # cmake is required for the h3 test
 RUN apt-get update && apt-get install -y cmake
 RUN patch -p1 < /ext-src/pg_hintplan.patch
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -4,11 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-default = []
-# Enables test specific features.
-testing = []
-
 [dependencies]
 anyhow.workspace = true
 async-compression.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -400,15 +400,7 @@ impl ComputeNode {
    pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let mut retry_period_ms = 500.0;
        let mut attempts = 0;
-        const DEFAULT_ATTEMPTS: u16 = 10;
-        #[cfg(feature = "testing")]
-        let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
-            u16::from_str(&v).unwrap()
-        } else {
-            DEFAULT_ATTEMPTS
-        };
-        #[cfg(not(feature = "testing"))]
-        let max_attempts = DEFAULT_ATTEMPTS;
+        let max_attempts = 10;
        loop {
            let result = self.try_get_basebackup(compute_state, lsn);
            match result {
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {

 fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
    for (var, val) in std::env::vars() {
-        if var.starts_with("NEON_") {
+        if var.starts_with("NEON_PAGESERVER_") {
            cmd = cmd.env(var, val);
        }
    }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -514,6 +514,7 @@ impl LocalEnv {
                #[derive(serde::Serialize, serde::Deserialize)]
                // (allow unknown fields, unlike PageServerConf)
                struct PageserverConfigTomlSubset {
+                    id: NodeId,
                    listen_pg_addr: String,
                    listen_http_addr: String,
                    pg_auth_type: AuthType,
@@ -525,30 +526,18 @@ impl LocalEnv {
                        .with_context(|| format!("read {:?}", config_toml_path))?,
                )
                .context("parse pageserver.toml")?;
-                let identity_toml_path = dentry.path().join("identity.toml");
-                #[derive(serde::Serialize, serde::Deserialize)]
-                struct IdentityTomlSubset {
-                    id: NodeId,
-                }
-                let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
-                    &std::fs::read_to_string(&identity_toml_path)
-                        .with_context(|| format!("read {:?}", identity_toml_path))?,
-                )
-                .context("parse identity.toml")?;
                let PageserverConfigTomlSubset {
+                    id: config_toml_id,
                    listen_pg_addr,
                    listen_http_addr,
                    pg_auth_type,
                    http_auth_type,
                } = config_toml;
-                let IdentityTomlSubset {
-                    id: identity_toml_id,
-                } = identity_toml;
                let conf = PageServerConf {
                    id: {
                        anyhow::ensure!(
-                            identity_toml_id == id,
-                            "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
+                            config_toml_id == id,
+                            "id mismatch: config_toml.id={config_toml_id} id={id}",
                        );
                        id
                    },
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -127,13 +127,10 @@ impl PageServerNode {
        }

        // Apply the user-provided overrides
-        overrides.push({
-            let mut doc =
-                toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
-            // `id` is written out to `identity.toml` instead of `pageserver.toml`
-            doc.remove("id").expect("it's part of the struct");
-            doc.to_string()
-        });
+        overrides.push(
+            toml_edit::ser::to_string_pretty(&conf)
+                .expect("we deserialized this from toml earlier"),
+        );

        // Turn `overrides` into a toml document.
        // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
        rm -rf $TMPDIR
        # We are running tests now
-        if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
+        if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
        then
            cleanup
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,15 +1,15 @@
 #!/bin/bash
 set -x

-cd /ext-src || exit 2
+cd /ext-src
 FAILED=
-LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
+LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
 for d in ${LIST}
 do
-       [ -d "${d}" ] || continue
+       [ -d ${d} ] || continue
    psql -c "select 1" >/dev/null || break
-       USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
+       make -C ${d} installcheck || FAILED="${d} ${FAILED}"
 done
 [ -z "${FAILED}" ] && exit 0
-echo "${FAILED}"
+echo ${FAILED}
 exit 1
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,6 +1,5 @@
-use std::collections::HashSet;
 use std::str::FromStr;
-use std::time::{Duration, Instant};
+use std::time::Instant;

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
@@ -295,42 +294,6 @@ pub enum PlacementPolicy {
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}

-/// Metadata health record posted from scrubber.
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthRecord {
-    pub tenant_shard_id: TenantShardId,
-    pub healthy: bool,
-    pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthUpdateRequest {
-    pub healthy_tenant_shards: HashSet<TenantShardId>,
-    pub unhealthy_tenant_shards: HashSet<TenantShardId>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct MetadataHealthUpdateResponse {}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListUnhealthyResponse {
-    pub unhealthy_tenant_shards: Vec<TenantShardId>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListOutdatedRequest {
-    #[serde(with = "humantime_serde")]
-    pub not_scrubbed_for: Duration,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-
-pub struct MetadataHealthListOutdatedResponse {
-    pub health_records: Vec<MetadataHealthRecord>,
-}
-
 #[cfg(test)]
 mod test {
    use super::*;
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -107,10 +107,7 @@ impl Key {
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
    pub fn to_i128(&self) -> i128 {
-        assert!(
-            self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
-            "invalid key: {self}",
-        );
+        assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
        (((self.field1 & 0x7F) as i128) << 120)
            | (((self.field2 & 0xFFFF) as i128) << 104)
            | ((self.field3 as i128) << 72)
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -637,13 +637,6 @@ pub struct TenantInfo {
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
    pub attachment_status: TenantAttachmentStatus,
    pub generation: u32,
-
-    /// Opaque explanation if gc is being blocked.
-    ///
-    /// Only looked up for the individual tenant detail, not the listing. This is purely for
-    /// debugging, not included in openapi.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub gc_blocking: Option<String>,
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -947,8 +940,6 @@ pub struct TopTenantShardsResponse {
 }

 pub mod virtual_file {
-    use std::path::PathBuf;
-
    #[derive(
        Copy,
        Clone,
@@ -967,53 +958,6 @@ pub mod virtual_file {
        #[cfg(target_os = "linux")]
        TokioEpollUring,
    }
-
-    /// Direct IO modes for a pageserver.
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-    pub enum DirectIoMode {
-        /// Direct IO disabled (uses usual buffered IO).
-        #[default]
-        Disabled,
-        /// Direct IO disabled (performs checks and perf simulations).
-        Evaluate {
-            /// Alignment check level
-            alignment_check: DirectIoAlignmentCheckLevel,
-            /// Latency padded for performance simulation.
-            latency_padding: DirectIoLatencyPadding,
-        },
-        /// Direct IO enabled.
-        Enabled {
-            /// Actions to perform on alignment error.
-            on_alignment_error: DirectIoOnAlignmentErrorAction,
-        },
-    }
-
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(rename_all = "kebab-case")]
-    pub enum DirectIoAlignmentCheckLevel {
-        #[default]
-        Error,
-        Log,
-        None,
-    }
-
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(rename_all = "kebab-case")]
-    pub enum DirectIoOnAlignmentErrorAction {
-        Error,
-        #[default]
-        FallbackToBuffered,
-    }
-
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(tag = "type", rename_all = "kebab-case")]
-    pub enum DirectIoLatencyPadding {
-        /// Pad virtual file operations with IO to a fake file.
-        FakeFileRW { path: PathBuf },
-        #[default]
-        None,
-    }
 }

 // Wrapped in libpq CopyData
@@ -1483,7 +1427,6 @@ mod tests {
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
-            gc_blocking: None,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
@@ -1506,7 +1449,6 @@ mod tests {
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
-            gc_blocking: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
--- a/libs/postgres_ffi/src/controlfile_utils.rs
+++ b/libs/postgres_ffi/src/controlfile_utils.rs
@@ -29,7 +29,7 @@ use anyhow::{bail, Result};
 use bytes::{Bytes, BytesMut};

 /// Equivalent to sizeof(ControlFileData) in C
-const SIZEOF_CONTROLDATA: usize = size_of::<ControlFileData>();
+const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();

 impl ControlFileData {
    /// Compute the offset of the `crc` field within the `ControlFileData` struct.
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -31,7 +31,7 @@ pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
 //

 // Assumes 8 byte alignment
-const SIZEOF_PAGE_HEADER_DATA: usize = size_of::<PageHeaderData>();
+const SIZEOF_PAGE_HEADER_DATA: usize = std::mem::size_of::<PageHeaderData>();
 pub const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = (SIZEOF_PAGE_HEADER_DATA + 7) & !7;

 //
@@ -191,7 +191,7 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
 pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
 pub const XLOG_TBLSPC_DROP: u8 = 0x10;

-pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
+pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::<XLogRecord>() as u32;

 //
 // from xlogrecord.h
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -42,9 +42,9 @@ pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
 pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;

-pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = size_of::<XLogPageHeaderData>();
-pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = size_of::<XLogLongPageHeaderData>();
-pub const XLOG_SIZE_OF_XLOG_RECORD: usize = size_of::<XLogRecord>();
+pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
+pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
+pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
 #[allow(clippy::identity_op)]
 pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;

@@ -311,7 +311,7 @@ impl XLogLongPageHeaderData {
    }
 }

-pub const SIZEOF_CHECKPOINT: usize = size_of::<CheckPoint>();
+pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();

 impl CheckPoint {
    pub fn encode(&self) -> Result<Bytes, SerializeError> {
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -178,7 +178,7 @@ pub fn test_find_end_of_wal_last_crossing_segment() {
 /// currently 1024.
 #[test]
 pub fn test_update_next_xid() {
-    let checkpoint_buf = [0u8; size_of::<CheckPoint>()];
+    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();

    checkpoint.nextXid = FullTransactionId { value: 10 };
@@ -204,7 +204,7 @@ pub fn test_update_next_xid() {

 #[test]
 pub fn test_update_next_multixid() {
-    let checkpoint_buf = [0u8; size_of::<CheckPoint>()];
+    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();

    // simple case
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -355,8 +355,7 @@ impl RemoteStorage for AzureBlobStorage {
                    .blobs()
                    .map(|k| ListingObject{
                        key: self.name_to_relative_path(&k.name),
-                        last_modified: k.properties.last_modified.into(),
-                        size: k.properties.content_length,
+                        last_modified: k.properties.last_modified.into()
                    }
                    );

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -144,7 +144,6 @@ impl RemotePath {
 ///
 /// The WithDelimiter mode will populate `prefixes` and `keys` in the result.  The
 /// NoDelimiter mode will only populate `keys`.
-#[derive(Copy, Clone)]
 pub enum ListingMode {
    WithDelimiter,
    NoDelimiter,
@@ -154,7 +153,6 @@ pub enum ListingMode {
 pub struct ListingObject {
    pub key: RemotePath,
    pub last_modified: SystemTime,
-    pub size: u64,
 }

 #[derive(Default)]
@@ -196,7 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;
+    ) -> impl Stream<Item = Result<Listing, DownloadError>>;

    async fn list(
        &self,
@@ -353,10 +351,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &'a CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
        match self {
            Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
-                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
+                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
            Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -368,7 +368,6 @@ impl RemoteStorage for LocalFs {
                            key: k.clone(),
                            // LocalFs is just for testing, so just specify a dummy time
                            last_modified: SystemTime::now(),
-                            size: 0,
                        })
                    }
                })
@@ -412,7 +411,6 @@ impl RemoteStorage for LocalFs {
                            key: RemotePath::from_string(&relative_key).unwrap(),
                            // LocalFs is just for testing
                            last_modified: SystemTime::now(),
-                            size: 0,
                        });
                    }
                }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -565,12 +565,9 @@ impl RemoteStorage for S3Bucket {
                        }
                    };

-                    let size = object.size.unwrap_or(0) as u64;
-
                    result.keys.push(ListingObject{
                        key,
-                        last_modified,
-                        size,
+                        last_modified
                    });
                    if let Some(mut mk) = max_keys {
                        assert!(mk > 0);
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
        async_stream::stream! {
            self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
                .map_err(DownloadError::Other)?;
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
 #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
-    /// Provides access to all data for a specific tenant (specified in `struct Claims` below)
+    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
    // TODO: join these two?
    Tenant,
-    /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
-    /// Should only be used e.g. for status check/tenant creation/list.
+    // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
+    // Should only be used e.g. for status check/tenant creation/list.
    PageServerApi,
-    /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
-    /// Should only be used e.g. for status check.
-    /// Currently also used for connection from any pageserver to any safekeeper.
+    // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
+    // Should only be used e.g. for status check.
+    // Currently also used for connection from any pageserver to any safekeeper.
    SafekeeperData,
-    /// The scope used by pageservers in upcalls to storage controller and cloud control plane
+    // The scope used by pageservers in upcalls to storage controller and cloud control plane
    #[serde(rename = "generations_api")]
    GenerationsApi,
-    /// Allows access to control plane managment API and some storage controller endpoints.
+    // Allows access to control plane management API and some storage controller endpoints.
    Admin,

    /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -8,10 +8,33 @@ pub struct Completion {
    _token: TaskTrackerToken,
 }

+impl std::fmt::Debug for Completion {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Completion")
+            .field("siblings", &self._token.task_tracker().len())
+            .finish()
+    }
+}
+
+impl Completion {
+    /// Returns true if this completion is associated with the given barrier.
+    pub fn blocks(&self, barrier: &Barrier) -> bool {
+        TaskTracker::ptr_eq(self._token.task_tracker(), &barrier.0)
+    }
+}
+
 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
 pub struct Barrier(TaskTracker);

+impl std::fmt::Debug for Barrier {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Barrier")
+            .field("remaining", &self.0.len())
+            .finish()
+    }
+}
+
 impl Default for Barrier {
    fn default() -> Self {
        let (_, rx) = channel();
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -78,9 +78,8 @@ impl Drop for GateGuard {
    }
 }

-#[derive(Debug, thiserror::Error)]
+#[derive(Debug)]
 pub enum GateError {
-    #[error("gate is closed")]
    GateClosed,
 }

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -49,7 +49,6 @@ postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 rand.workspace = true
-range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
@@ -108,7 +107,3 @@ harness = false
 [[bench]]
 name = "bench_walredo"
 harness = false
-
-[[bench]]
-name = "bench_ingest"
-harness = false
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -1,239 +0,0 @@
-use std::{env, num::NonZeroUsize};
-
-use bytes::Bytes;
-use camino::Utf8PathBuf;
-use criterion::{criterion_group, criterion_main, Criterion};
-use pageserver::{
-    config::PageServerConf,
-    context::{DownloadBehavior, RequestContext},
-    l0_flush::{L0FlushConfig, L0FlushGlobalState},
-    page_cache,
-    repository::Value,
-    task_mgr::TaskKind,
-    tenant::storage_layer::InMemoryLayer,
-    virtual_file,
-};
-use pageserver_api::{key::Key, shard::TenantShardId};
-use utils::{
-    bin_ser::BeSer,
-    id::{TenantId, TimelineId},
-};
-
-// A very cheap hash for generating non-sequential keys.
-fn murmurhash32(mut h: u32) -> u32 {
-    h ^= h >> 16;
-    h = h.wrapping_mul(0x85ebca6b);
-    h ^= h >> 13;
-    h = h.wrapping_mul(0xc2b2ae35);
-    h ^= h >> 16;
-    h
-}
-
-enum KeyLayout {
-    /// Sequential unique keys
-    Sequential,
-    /// Random unique keys
-    Random,
-    /// Random keys, but only use the bits from the mask of them
-    RandomReuse(u32),
-}
-
-enum WriteDelta {
-    Yes,
-    No,
-}
-
-async fn ingest(
-    conf: &'static PageServerConf,
-    put_size: usize,
-    put_count: usize,
-    key_layout: KeyLayout,
-    write_delta: WriteDelta,
-) -> anyhow::Result<()> {
-    let mut lsn = utils::lsn::Lsn(1000);
-    let mut key = Key::from_i128(0x0);
-
-    let timeline_id = TimelineId::generate();
-    let tenant_id = TenantId::generate();
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
-
-    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-
-    let gate = utils::sync::gate::Gate::default();
-    let entered = gate.enter().unwrap();
-
-    let layer =
-        InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
-
-    let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
-    let ctx = RequestContext::new(
-        pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
-        pageserver::context::DownloadBehavior::Download,
-    );
-
-    for i in 0..put_count {
-        lsn += put_size as u64;
-
-        // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
-        // usually care the most about write performance when they're blasting a huge batch of data into a huge table.
-        match key_layout {
-            KeyLayout::Sequential => {
-                // Use sequential order to illustrate the experience a user is likely to have
-                // when ingesting bulk data.
-                key.field6 = i as u32;
-            }
-            KeyLayout::Random => {
-                // Use random-order keys to avoid giving a false advantage to data structures that are
-                // faster when inserting on the end.
-                key.field6 = murmurhash32(i as u32);
-            }
-            KeyLayout::RandomReuse(mask) => {
-                // Use low bits only, to limit cardinality
-                key.field6 = murmurhash32(i as u32) & mask;
-            }
-        }
-
-        layer.put_value(key, lsn, &data, &ctx).await?;
-    }
-    layer.freeze(lsn + 1).await;
-
-    if matches!(write_delta, WriteDelta::Yes) {
-        let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
-            max_concurrency: NonZeroUsize::new(1).unwrap(),
-        });
-        let (_desc, path) = layer
-            .write_to_disk(&ctx, None, l0_flush_state.inner())
-            .await?
-            .unwrap();
-        tokio::fs::remove_file(path).await?;
-    }
-
-    Ok(())
-}
-
-/// Wrapper to instantiate a tokio runtime
-fn ingest_main(
-    conf: &'static PageServerConf,
-    put_size: usize,
-    put_count: usize,
-    key_layout: KeyLayout,
-    write_delta: WriteDelta,
-) {
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .unwrap();
-
-    runtime.block_on(async move {
-        let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
-        if let Err(e) = r {
-            panic!("{e:?}");
-        }
-    });
-}
-
-/// Declare a series of benchmarks for the Pageserver's ingest write path.
-///
-/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either
-/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
-///
-/// Genuine disk I/O is used, so expect results to differ depending on storage.  However, when running on
-/// a fast disk, CPU is the bottleneck at time of writing.
-fn criterion_benchmark(c: &mut Criterion) {
-    let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
-    let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap();
-    eprintln!("Data directory: {}", temp_dir.path());
-
-    let conf: &'static PageServerConf = Box::leak(Box::new(
-        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
-    ));
-    virtual_file::init(16384, virtual_file::io_engine_for_bench());
-    page_cache::init(conf.page_cache_size);
-
-    {
-        let mut group = c.benchmark_group("ingest-small-values");
-        let put_size = 100usize;
-        let put_count = 128 * 1024 * 1024 / put_size;
-        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
-        group.sample_size(10);
-        group.bench_function("ingest 128MB/100b seq", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b rand", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Random,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::RandomReuse(0x3ff),
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b seq, no delta", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::No,
-                )
-            })
-        });
-    }
-
-    {
-        let mut group = c.benchmark_group("ingest-big-values");
-        let put_size = 8192usize;
-        let put_count = 128 * 1024 * 1024 / put_size;
-        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
-        group.sample_size(10);
-        group.bench_function("ingest 128MB/8k seq", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/8k seq, no delta", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::No,
-                )
-            })
-        });
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
-criterion_main!(benches);
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,4 +1,3 @@
-use criterion::measurement::WallTime;
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
@@ -16,11 +15,7 @@ use utils::id::{TenantId, TimelineId};

 use utils::lsn::Lsn;

-use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
-
-fn fixture_path(relative: &str) -> PathBuf {
-    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
-}
+use criterion::{black_box, criterion_group, criterion_main, Criterion};

 fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
    let mut layer_map = LayerMap::default();
@@ -114,7 +109,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
 // between each test run.
 fn bench_from_captest_env(c: &mut Criterion) {
    // TODO consider compressing this file
-    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

    // Test with uniform query pattern
@@ -144,7 +139,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
 fn bench_from_real_project(c: &mut Criterion) {
    // Init layer map
    let now = Instant::now();
-    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
    println!("Finished layer map init in {:?}", now.elapsed());

    // Choose uniformly distributed queries
@@ -247,72 +242,7 @@ fn bench_sequential(c: &mut Criterion) {
    group.finish();
 }

-fn bench_visibility_with_map(
-    group: &mut BenchmarkGroup<WallTime>,
-    layer_map: LayerMap,
-    read_points: Vec<Lsn>,
-    bench_name: &str,
-) {
-    group.bench_function(bench_name, |b| {
-        b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
-    });
-}
-
-// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
-fn bench_visibility(c: &mut Criterion) {
-    let mut group = c.benchmark_group("visibility");
-    {
-        // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
-        let now = Instant::now();
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-        for i in 0..100_000 {
-            let i32 = (i as u32) % 100;
-            let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-            let layer = PersistentLayerDesc::new_img(
-                TenantShardId::unsharded(TenantId::generate()),
-                TimelineId::generate(),
-                zero.add(10 * i32)..zero.add(10 * i32 + 1),
-                Lsn(i),
-                0,
-            );
-            updates.insert_historic(layer);
-        }
-        updates.flush();
-        println!("Finished layer map init in {:?}", now.elapsed());
-
-        let mut read_points = Vec::new();
-        for i in (0..100_000).step_by(1000) {
-            read_points.push(Lsn(i));
-        }
-
-        bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
-    }
-
-    {
-        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
-        let read_points = vec![Lsn(0x1C760FA190)];
-        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
-
-        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
-        let read_points = vec![
-            Lsn(0x1C760FA190),
-            Lsn(0x000000931BEAD539),
-            Lsn(0x000000931BF63011),
-            Lsn(0x000000931B33AE68),
-            Lsn(0x00000038E67ABFA0),
-            Lsn(0x000000931B33AE68),
-            Lsn(0x000000914E3F38F0),
-            Lsn(0x000000931B33AE68),
-        ];
-        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
-    }
-
-    group.finish();
-}
-
 criterion_group!(group_1, bench_from_captest_env);
 criterion_group!(group_2, bench_from_real_project);
 criterion_group!(group_3, bench_sequential);
-criterion_group!(group_4, bench_visibility);
-criterion_main!(group_1, group_2, group_3, group_4);
+criterion_main!(group_1, group_2, group_3);
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -17,9 +17,11 @@ use pageserver::config::PageserverIdentity;
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
-use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
+use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
-use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
+use pageserver::{
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
+};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
@@ -29,9 +31,11 @@ use tracing::*;
 use metrics::set_build_info_metric;
 use pageserver::{
    config::PageServerConf,
+    context::{DownloadBehavior, RequestContext},
    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
-    task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
+    task_mgr::TaskKind,
+    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
    tenant::mgr,
    virtual_file,
 };
@@ -123,10 +127,8 @@ fn main() -> anyhow::Result<()> {

    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
-    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
    info!(?conf.get_impl, "starting with get page implementation");
    info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
-    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
@@ -591,13 +593,30 @@ fn start_pageserver(

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
-        let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
-        pageserver_listener
-            .set_nonblocking(true)
-            .context("set listener to nonblocking")?;
-        tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
-    });
+    let libpq_listener = {
+        let cancel = CancellationToken::new();
+        let libpq_ctx = RequestContext::todo_child(
+            TaskKind::LibpqEndpointListener,
+            // listener task shouldn't need to download anything. (We will
+            // create separate sub-contexts for each connection, with their
+            // own download behavior. This context is used only to listen and
+            // accept connections.)
+            DownloadBehavior::Error,
+        );
+
+        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+            "libpq listener",
+            page_service::libpq_listener_main(
+                tenant_manager.clone(),
+                pg_auth,
+                pageserver_listener,
+                conf.pg_auth_type,
+                libpq_ctx,
+                cancel.clone(),
+            ),
+        ));
+        LibpqEndpointListener(CancellableTask { task, cancel })
+    };

    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

@@ -625,7 +644,7 @@ fn start_pageserver(
            shutdown_pageserver.take();
            pageserver::shutdown_pageserver(
                http_endpoint_listener,
-                page_service,
+                libpq_listener,
                consumption_metrics_tasks,
                disk_usage_eviction_task,
                &tenant_manager,
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -29,7 +29,6 @@ use utils::{
    logging::LogFormat,
 };

-use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -53,7 +52,7 @@ pub mod defaults {
    use pageserver_api::models::ImageCompressionAlgorithm;
    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

-    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
+    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";

    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
@@ -84,16 +83,16 @@ pub mod defaults {
    #[cfg(not(target_os = "linux"))]
    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";

-    pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored";
+    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";

-    pub const DEFAULT_GET_IMPL: &str = "vectored";
+    pub const DEFAULT_GET_IMPL: &str = "legacy";

    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
        ImageCompressionAlgorithm::Disabled;

-    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
+    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

@@ -296,13 +295,6 @@ pub struct PageServerConf {
    pub ephemeral_bytes_per_memory_kb: usize,

    pub l0_flush: L0FlushConfig,
-
-    /// This flag is temporary and will be removed after gradual rollout.
-    /// See <https://github.com/neondatabase/neon/issues/8184>.
-    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
-
-    /// Direct IO settings
-    pub virtual_file_direct_io: virtual_file::DirectIoMode,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -364,6 +356,8 @@ struct PageServerConfigBuilder {
    auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,

+    id: BuilderValue<NodeId>,
+
    broker_endpoint: BuilderValue<Uri>,
    broker_keepalive_interval: BuilderValue<Duration>,

@@ -409,15 +403,14 @@ struct PageServerConfigBuilder {
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,

    l0_flush: BuilderValue<L0FlushConfig>,
-
-    compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
-
-    virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
 }

 impl PageServerConfigBuilder {
-    fn new() -> Self {
-        Self::default()
+    fn new(node_id: NodeId) -> Self {
+        let mut this = Self::default();
+        this.id(node_id);
+
+        this
    }

    #[inline(always)]
@@ -445,6 +438,7 @@ impl PageServerConfigBuilder {
            pg_auth_type: Set(AuthType::Trust),
            auth_validation_public_key_path: Set(None),
            remote_storage_config: Set(None),
+            id: NotSet,
            broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                .parse()
                .expect("failed to parse default broker endpoint")),
@@ -502,8 +496,6 @@ impl PageServerConfigBuilder {
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: Set(L0FlushConfig::default()),
-            compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
-            virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
        }
    }
 }
@@ -576,6 +568,10 @@ impl PageServerConfigBuilder {
        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
    }

+    pub fn id(&mut self, node_id: NodeId) {
+        self.id = BuilderValue::Set(node_id)
+    }
+
    pub fn log_format(&mut self, log_format: LogFormat) {
        self.log_format = BuilderValue::Set(log_format)
    }
@@ -687,15 +683,7 @@ impl PageServerConfigBuilder {
        self.l0_flush = BuilderValue::Set(value);
    }

-    pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
-        self.compact_level0_phase1_value_access = BuilderValue::Set(value);
-    }
-
-    pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) {
-        self.virtual_file_direct_io = BuilderValue::Set(value);
-    }
-
-    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
+    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

        macro_rules! conf {
@@ -728,6 +716,7 @@ impl PageServerConfigBuilder {
                pg_auth_type,
                auth_validation_public_key_path,
                remote_storage_config,
+                id,
                broker_endpoint,
                broker_keepalive_interval,
                log_format,
@@ -752,12 +741,9 @@ impl PageServerConfigBuilder {
                image_compression,
                ephemeral_bytes_per_memory_kb,
                l0_flush,
-                compact_level0_phase1_value_access,
-                virtual_file_direct_io,
            }
            CUSTOM LOGIC
            {
-                id: id,
                // TenantConf is handled separately
                default_tenant_conf: TenantConf::default(),
                concurrent_tenant_warmup: ConfigurableSemaphore::new({
@@ -907,7 +893,7 @@ impl PageServerConf {
        toml: &Document,
        workdir: &Utf8Path,
    ) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::new();
+        let mut builder = PageServerConfigBuilder::new(node_id);
        builder.workdir(workdir.to_owned());

        let mut t_conf = TenantConfOpt::default();
@@ -938,6 +924,8 @@ impl PageServerConf {
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
+                "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
+                            // Logging is not set up yet, so we can't log that the value is ignored.
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                "log_format" => builder.log_format(
@@ -1026,17 +1014,11 @@ impl PageServerConf {
                "l0_flush" => {
                    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
                }
-                "compact_level0_phase1_value_access" => {
-                    builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
-                }
-                "virtual_file_direct_io" => {
-                    builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
-                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }

-        let mut conf = builder.build(node_id).context("invalid config")?;
+        let mut conf = builder.build().context("invalid config")?;

        if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
            let auth_validation_public_key_path = conf
@@ -1116,8 +1098,6 @@ impl PageServerConf {
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
            l0_flush: L0FlushConfig::default(),
-            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-            virtual_file_direct_io: virtual_file::DirectIoMode::default(),
        }
    }
 }
@@ -1275,6 +1255,7 @@ max_file_descriptors = 333

 # initial superuser role name to use when creating a new tenant
 initial_superuser_name = 'zzzz'
+id = 10

 metric_collection_interval = '222 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
@@ -1291,8 +1272,9 @@ background_task_maximum_delay = '334 s'
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
        // we have to create dummy values to overcome the validation errors
-        let config_string =
-            format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
+        let config_string = format!(
+            "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
+        );
        let toml = config_string.parse()?;

        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
@@ -1359,8 +1341,6 @@ background_task_maximum_delay = '334 s'
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
-                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1435,8 +1415,6 @@ background_task_maximum_delay = '334 s'
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
-                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1601,6 +1579,7 @@ broker_endpoint = '{broker_endpoint}'
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
+id = 222

 [disk_usage_based_eviction]
 max_usage_pct = 80
@@ -1670,6 +1649,7 @@ threshold = "20m"
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
+id = 222

 [tenant_config]
 evictions_low_residence_duration_metric_threshold = "20m"
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -308,45 +308,6 @@ paths:
            application/json:
              schema:
                type: string
-
-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Persistently add a gc blocking at the tenant level because of this timeline
-      responses:
-        "200":
-          description: OK
-
-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Persistently remove a tenant level gc blocking for this timeline
-      responses:
-        "200":
-          description: OK
-
  /v1/tenant/{tenant_shard_id}/location_config:
    parameters:
      - name: tenant_shard_id
@@ -932,7 +893,7 @@ components:
          description: Whether to poll remote storage for layers to download.  If false, secondary locations don't download anything.
    ArchivalConfigRequest:
      type: object
-      required:
+      required:
        - state
      properties:
        state:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -296,11 +296,6 @@ impl From<GetActiveTenantError> for ApiError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => {
                ApiError::ResourceUnavailable(format!("{}", e).into())
            }
-            GetActiveTenantError::SwitchedTenant => {
-                // in our HTTP handlers, this error doesn't happen
-                // TODO: separate error types
-                ApiError::ResourceUnavailable("switched tenant".into())
-            }
        }
    }
 }
@@ -935,7 +930,6 @@ async fn tenant_list_handler(
            generation: (*gen)
                .into()
                .expect("Tenants are always attached with a generation"),
-            gc_blocking: None,
        })
        .collect::<Vec<TenantInfo>>();

@@ -987,7 +981,6 @@ async fn tenant_status(
                    .generation()
                    .into()
                    .expect("Tenants are always attached with a generation"),
-                gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
            },
            walredo: tenant.wal_redo_manager_status(),
            timelines: tenant.list_timeline_ids(),
@@ -1162,10 +1155,7 @@ async fn layer_map_info_handler(
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
-    let layer_map_info = timeline
-        .layer_map_info(reset)
-        .await
-        .map_err(|_shutdown| ApiError::ShuttingDown)?;
+    let layer_map_info = timeline.layer_map_info(reset).await;

    json_response(StatusCode::OK, layer_map_info)
 }
@@ -1231,72 +1221,6 @@ async fn evict_timeline_layer_handler(
    }
 }

-async fn timeline_gc_blocking_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    block_or_unblock_gc(request, true).await
-}
-
-async fn timeline_gc_unblocking_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    block_or_unblock_gc(request, false).await
-}
-
-/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
-///
-/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
-async fn block_or_unblock_gc(
-    request: Request<Body>,
-    block: bool,
-) -> Result<Response<Body>, ApiError> {
-    use crate::tenant::{
-        remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
-    };
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let state = get_state(&request);
-
-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-    let timeline = tenant.get_timeline(timeline_id, true)?;
-
-    let fut = async {
-        if block {
-            timeline.block_gc(&tenant).await.map(|_| ())
-        } else {
-            timeline.unblock_gc(&tenant).await
-        }
-    };
-
-    let span = tracing::info_span!(
-        "block_or_unblock_gc",
-        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
-        timeline_id = %timeline_id,
-        block = block,
-    );
-
-    let res = fut.instrument(span).await;
-
-    res.map_err(|e| {
-        if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
-            ApiError::ShuttingDown
-        } else {
-            ApiError::InternalServerError(e)
-        }
-    })?;
-
-    json_response(StatusCode::OK, ())
-}
-
 /// Get tenant_size SVG graph along with the JSON data.
 fn synthetic_size_html_response(
    inputs: ModelInputs,
@@ -1887,7 +1811,7 @@ async fn timeline_detach_ancestor_handler(
        // drop(tenant);

        let resp = match progress {
-            detach_ancestor::Progress::Prepared(_guard, prepared) => {
+            detach_ancestor::Progress::Prepared(attempt, prepared) => {
                // it would be great to tag the guard on to the tenant activation future
                let reparented_timelines = state
                    .tenant_manager
@@ -1895,10 +1819,10 @@ async fn timeline_detach_ancestor_handler(
                        tenant_shard_id,
                        timeline_id,
                        prepared,
+                        attempt,
                        ctx,
                    )
                    .await
-                    .context("timeline detach ancestor completion")
                    .map_err(ApiError::InternalServerError)?;

                AncestorDetached {
@@ -2205,24 +2129,14 @@ async fn secondary_download_handler(

    let timeout = wait.unwrap_or(Duration::MAX);

-    let result = tokio::time::timeout(
+    let status = match tokio::time::timeout(
        timeout,
        state.secondary_controller.download_tenant(tenant_shard_id),
    )
-    .await;
-
-    let progress = secondary_tenant.progress.lock().unwrap().clone();
-
-    let status = match result {
-        Ok(Ok(())) => {
-            if progress.layers_downloaded >= progress.layers_total {
-                // Download job ran to completion
-                StatusCode::OK
-            } else {
-                // Download dropped out without errors because it ran out of time budget
-                StatusCode::ACCEPTED
-            }
-        }
+    .await
+    {
+        // Download job ran to completion.
+        Ok(Ok(())) => StatusCode::OK,
        // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
        // okay.  We could get an error here in the unlikely edge case that the tenant
        // was detached between our check above and executing the download job.
@@ -2232,6 +2146,8 @@ async fn secondary_download_handler(
        Err(_) => StatusCode::ACCEPTED,
    };

+    let progress = secondary_tenant.progress.lock().unwrap().clone();
+
    json_response(status, progress)
 }

@@ -2975,14 +2891,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, evict_timeline_layer_handler),
        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
-            |r| api_handler(r, timeline_gc_blocking_handler),
-        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
-            |r| api_handler(r, timeline_gc_unblocking_handler),
-        )
        .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
            api_handler(r, secondary_upload_handler)
        })
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -2,29 +2,19 @@ use std::{num::NonZeroUsize, sync::Arc};

 use crate::tenant::ephemeral_file;

-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
+#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
+    #[default]
    PageCached,
    #[serde(rename_all = "snake_case")]
-    Direct {
-        max_concurrency: NonZeroUsize,
-    },
-}
-
-impl Default for L0FlushConfig {
-    fn default() -> Self {
-        Self::Direct {
-            // TODO: using num_cpus results in different peak memory usage on different instance types.
-            max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
-        }
-    }
+    Direct { max_concurrency: NonZeroUsize },
 }

 #[derive(Clone)]
 pub struct L0FlushGlobalState(Arc<Inner>);

-pub enum Inner {
+pub(crate) enum Inner {
    PageCached,
    Direct { semaphore: tokio::sync::Semaphore },
 }
@@ -40,7 +30,7 @@ impl L0FlushGlobalState {
        }
    }

-    pub fn inner(&self) -> &Arc<Inner> {
+    pub(crate) fn inner(&self) -> &Arc<Inner> {
        &self.0
    }
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -12,8 +12,6 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
-
-use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
 pub mod aux_file;
@@ -32,13 +30,14 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
 use tenant::{
    mgr::{BackgroundPurges, TenantManager},
    secondary,
 };
-use tracing::{info, info_span};
+use tracing::info;

 /// Current storage format version
 ///
@@ -64,6 +63,7 @@ pub struct CancellableTask {
    pub cancel: CancellationToken,
 }
 pub struct HttpEndpointListener(pub CancellableTask);
+pub struct LibpqEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
 impl CancellableTask {
@@ -77,7 +77,7 @@ impl CancellableTask {
 #[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
    http_listener: HttpEndpointListener,
-    page_service: page_service::Listener,
+    libpq_listener: LibpqEndpointListener,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
@@ -87,83 +87,10 @@ pub async fn shutdown_pageserver(
    exit_code: i32,
 ) {
    use std::time::Duration;
-
-    // If the orderly shutdown below takes too long, we still want to make
-    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
-    //
-    // (Leftover walredo processes are the hypothesized trigger for the systemd freezes
-    //  that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
-    //
-    // We use a thread instead of a tokio task because the background runtime is likely busy
-    // with the final flushing / uploads. This activity here has priority, and due to lack
-    // of scheduling priority feature sin the tokio scheduler, using a separate thread is
-    // an effective priority booster.
-    let walredo_extraordinary_shutdown_thread_span = {
-        let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
-        span.follows_from(tracing::Span::current());
-        span
-    };
-    let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
-    let walredo_extraordinary_shutdown_thread = std::thread::spawn({
-        let walredo_extraordinary_shutdown_thread_cancel =
-            walredo_extraordinary_shutdown_thread_cancel.clone();
-        move || {
-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .unwrap();
-            let _entered = rt.enter();
-            let _entered = walredo_extraordinary_shutdown_thread_span.enter();
-            if let Ok(()) = rt.block_on(tokio::time::timeout(
-                Duration::from_secs(8),
-                walredo_extraordinary_shutdown_thread_cancel.cancelled(),
-            )) {
-                info!("cancellation requested");
-                return;
-            }
-            let managers = tenant::WALREDO_MANAGERS
-                .lock()
-                .unwrap()
-                // prevents new walredo managers from being inserted
-                .take()
-                .expect("only we take()");
-            // Use FuturesUnordered to get in queue early for each manager's
-            // heavier_once_cell semaphore wait list.
-            // Also, for idle tenants that for some reason haven't
-            // shut down yet, it's quite likely that we're not going
-            // to get Poll::Pending once.
-            let mut futs: FuturesUnordered<_> = managers
-                .into_iter()
-                .filter_map(|(_, mgr)| mgr.upgrade())
-                .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
-                .collect();
-            info!(count=%futs.len(), "built FuturesUnordered");
-            let mut last_log_at = std::time::Instant::now();
-            #[derive(Debug, Default)]
-            struct Results {
-                initiated: u64,
-                already: u64,
-            }
-            let mut results = Results::default();
-            while let Some(we_initiated) = rt.block_on(futs.next()) {
-                if we_initiated {
-                    results.initiated += 1;
-                } else {
-                    results.already += 1;
-                }
-                if last_log_at.elapsed() > Duration::from_millis(100) {
-                    info!(remaining=%futs.len(), ?results, "progress");
-                    last_log_at = std::time::Instant::now();
-                }
-            }
-            info!(?results, "done");
-        }
-    });
-
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    let remaining_connections = timed(
-        page_service.stop_accepting(),
+    timed(
+        libpq_listener.0.shutdown(),
        "shutdown LibpqEndpointListener",
        Duration::from_secs(1),
    )
@@ -181,7 +108,7 @@ pub async fn shutdown_pageserver(
    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
    // should already have been canclled via mgr::shutdown_all_tenants
    timed(
-        remaining_connections.shutdown(),
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
        "shutdown PageRequestHandlers",
        Duration::from_secs(1),
    )
@@ -235,12 +162,6 @@ pub async fn shutdown_pageserver(
        Duration::from_secs(1),
    )
    .await;
-
-    info!("cancel & join walredo_extraordinary_shutdown_thread");
-    walredo_extraordinary_shutdown_thread_cancel.cancel();
-    walredo_extraordinary_shutdown_thread.join().unwrap();
-    info!("walredo_extraordinary_shutdown_thread done");
-
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -525,15 +525,6 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_visible_physical_size",
-        "The size of the layer files present in the pageserver's filesystem.",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
        "pageserver_resident_physical_size_global",
@@ -622,23 +613,7 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_compression_image_in_bytes_total",
-        "Size of data written into image layers before compression"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_in_bytes_considered",
-        "Size of potentially compressible data written into image layers before compression"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_in_bytes_chosen",
-        "Size of data whose compressed form was written into image layers"
+        "Size of uncompressed data written into image layers"
    )
    .expect("failed to define a metric")
 });
@@ -2213,7 +2188,6 @@ pub(crate) struct TimelineMetrics {
    pub(crate) layer_count_delta: UIntGauge,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
-    pub visible_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub aux_file_size_gauge: IntGauge,
@@ -2336,9 +2310,6 @@ impl TimelineMetrics {
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
-        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        // TODO: we shouldn't expose this metric
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2393,7 +2364,6 @@ impl TimelineMetrics {
            layer_count_delta,
            standby_horizon_gauge,
            resident_physical_size_gauge,
-            visible_physical_size_gauge,
            current_logical_size_gauge,
            aux_file_size_gauge,
            directory_entries_count_gauge,
@@ -2445,7 +2415,6 @@ impl TimelineMetrics {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
-        let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -8,7 +8,8 @@ use std::time::Duration;
 pub use pageserver_api::key::{Key, KEY_SIZE};

 /// A 'value' stored for a one Key.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(test, derive(PartialEq))]
 pub enum Value {
    /// An Image value contains a full copy of the value
    Image(Bytes),
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -56,6 +56,7 @@ impl Statvfs {
 }

 pub mod mock {
+    use anyhow::Context;
    use camino::Utf8Path;
    use regex::Regex;
    use tracing::log::info;
@@ -134,30 +135,14 @@ pub mod mock {
            {
                continue;
            }
-            let m = match entry.metadata() {
-                Ok(m) => m,
-                Err(e) if is_not_found(&e) => {
-                    // some temp file which got removed right as we are walking
-                    continue;
-                }
-                Err(e) => {
-                    return Err(anyhow::Error::new(e)
-                        .context(format!("get metadata of {:?}", entry.path())))
-                }
-            };
-            total += m.len();
+            total += entry
+                .metadata()
+                .with_context(|| format!("get metadata of {:?}", entry.path()))?
+                .len();
        }
        Ok(total)
    }

-    fn is_not_found(e: &walkdir::Error) -> bool {
-        let Some(io_error) = e.io_error() else {
-            return false;
-        };
-        let kind = io_error.kind();
-        matches!(kind, std::io::ErrorKind::NotFound)
-    }
-
    pub struct Statvfs {
        pub blocks: u64,
        pub blocks_available: u64,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -28,12 +28,6 @@ use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-#[derive(Copy, Clone, Debug)]
-pub struct CompressionInfo {
-    pub written_compressed: bool,
-    pub compressed_size: Option<usize>,
-}
-
 impl<'a> BlockCursor<'a> {
    /// Read a blob into a new buffer.
    pub async fn read_blob(
@@ -279,10 +273,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: B,
        ctx: &RequestContext,
    ) -> (B::Buf, Result<u64, Error>) {
-        let (buf, res) = self
-            .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
-            .await;
-        (buf, res.map(|(off, _compression_info)| off))
+        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
+            .await
    }

    /// Write a blob of data. Returns the offset that it was written to,
@@ -292,12 +284,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: B,
        ctx: &RequestContext,
        algorithm: ImageCompressionAlgorithm,
-    ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
+    ) -> (B::Buf, Result<u64, Error>) {
        let offset = self.offset;
-        let mut compression_info = CompressionInfo {
-            written_compressed: false,
-            compressed_size: None,
-        };

        let len = srcbuf.bytes_init();

@@ -340,9 +328,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                        encoder.write_all(&slice[..]).await.unwrap();
                        encoder.shutdown().await.unwrap();
                        let compressed = encoder.into_inner();
-                        compression_info.compressed_size = Some(compressed.len());
                        if compressed.len() < len {
-                            compression_info.written_compressed = true;
                            let compressed_len = compressed.len();
                            compressed_buf = Some(compressed);
                            (BYTE_ZSTD, compressed_len, slice.into_inner())
@@ -373,7 +359,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        } else {
            self.write_all(srcbuf, ctx).await
        };
-        (srcbuf, res.map(|_| (offset, compression_info)))
+        (srcbuf, res.map(|_| offset))
    }
 }

@@ -430,14 +416,12 @@ pub(crate) mod tests {
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
                let (_, res) = if compression {
-                    let res = wtr
-                        .write_blob_maybe_compressed(
-                            blob.clone(),
-                            ctx,
-                            ImageCompressionAlgorithm::Zstd { level: Some(1) },
-                        )
-                        .await;
-                    (res.0, res.1.map(|(off, _)| off))
+                    wtr.write_blob_maybe_compressed(
+                        blob.clone(),
+                        ctx,
+                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
+                    )
+                    .await
                } else {
                    wtr.write_blob(blob.clone(), ctx).await
                };
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -296,19 +296,13 @@ where
            let mut stack = Vec::new();
            stack.push((self.root_blk, None));
            let block_cursor = self.reader.block_cursor();
-            let mut node_buf = [0_u8; PAGE_SZ];
            while let Some((node_blknum, opt_iter)) = stack.pop() {
-                // Read the node, through the PS PageCache, into local variable `node_buf`.
-                // We could keep the page cache read guard alive, but, at the time of writing,
-                // we run quite small PS PageCache s => can't risk running out of
-                // PageCache space because this stream isn't consumed fast enough.
-                let page_read_guard = block_cursor
+                // Locate the node.
+                let node_buf = block_cursor
                    .read_blk(self.start_blk + node_blknum, ctx)
                    .await?;
-                node_buf.copy_from_slice(page_read_guard.as_ref());
-                drop(page_read_guard); // drop page cache read guard early

-                let node = OnDiskNode::deparse(&node_buf)?;
+                let node = OnDiskNode::deparse(node_buf.as_ref())?;
                let prefix_len = node.prefix_len as usize;
                let suffix_len = node.suffix_len as usize;

@@ -351,7 +345,6 @@ where
                    Either::Left(idx..node.num_children.into())
                };

-
                // idx points to the first match now. Keep going from there
                while let Some(idx) = iter.next() {
                    let key_off = idx * suffix_len;
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -29,7 +29,6 @@ impl EphemeralFile {
        conf: &PageServerConf,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
-        gate_guard: utils::sync::gate::GateGuard,
        ctx: &RequestContext,
    ) -> Result<EphemeralFile, io::Error> {
        static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -52,12 +51,10 @@ impl EphemeralFile {
        )
        .await?;

-        let prewarm = conf.l0_flush.prewarm_on_write();
-
        Ok(EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, prewarm, gate_guard),
+            rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
        })
    }

@@ -164,11 +161,7 @@ mod tests {
    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;

-        let gate = utils::sync::gate::Gate::default();
-
-        let entered = gate.enter().unwrap();
-
-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;

        let pos_foo = file.write_blob(b"foo", &ctx).await?;
        assert_eq!(
@@ -222,38 +215,4 @@ mod tests {

        Ok(())
    }
-
-    #[tokio::test]
-    async fn ephemeral_file_holds_gate_open() {
-        const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
-
-        let (conf, tenant_id, timeline_id, ctx) =
-            harness("ephemeral_file_holds_gate_open").unwrap();
-
-        let gate = utils::sync::gate::Gate::default();
-
-        let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-            .await
-            .unwrap();
-
-        let mut closing = tokio::task::spawn(async move {
-            gate.close().await;
-        });
-
-        // gate is entered until the ephemeral file is dropped
-        // do not start paused tokio-epoll-uring has a sleep loop
-        tokio::time::pause();
-        tokio::time::timeout(FOREVER, &mut closing)
-            .await
-            .expect_err("closing cannot complete before dropping");
-
-        // this is a requirement of the reset_tenant functionality: we have to be able to restart a
-        // tenant fast, and for that, we need all tenant_dir operations be guarded by entering a gate
-        drop(file);
-
-        tokio::time::timeout(FOREVER, &mut closing)
-            .await
-            .expect("closing completes right away")
-            .expect("closing does not panic");
-    }
 }
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -18,8 +18,6 @@ use super::zero_padded_read_write;
 pub struct RW {
    page_cache_file_id: page_cache::FileId,
    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
-    /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
-    _gate_guard: utils::sync::gate::GateGuard,
 }

 /// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
@@ -31,11 +29,7 @@ pub enum PrewarmOnWrite {
 }

 impl RW {
-    pub fn new(
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-        _gate_guard: utils::sync::gate::GateGuard,
-    ) -> Self {
+    pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
        let page_cache_file_id = page_cache::next_file_id();
        Self {
            page_cache_file_id,
@@ -44,7 +38,6 @@ impl RW {
                file,
                prewarm_on_write,
            )),
-            _gate_guard,
        }
    }

@@ -152,7 +145,6 @@ impl Drop for RW {
        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.

        // unlink the file
-        // we are clear to do this, because we have entered a gate
        let res = std::fs::remove_file(&self.rw.as_writer().file.path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,213 +0,0 @@
-use std::collections::HashMap;
-
-use utils::id::TimelineId;
-
-use super::remote_timeline_client::index::GcBlockingReason;
-
-type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
-
-#[derive(Default)]
-pub(crate) struct GcBlock {
-    /// The timelines which have current reasons to block gc.
-    ///
-    /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
-    /// to keep this field up to date with RemoteTimelineClient `upload_queue.dirty`.
-    reasons: std::sync::Mutex<Storage>,
-    blocking: tokio::sync::Mutex<()>,
-}
-
-impl GcBlock {
-    /// Start another gc iteration.
-    ///
-    /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
-    /// its ending, or if not currently possible, a value describing the reasons why not.
-    ///
-    /// Cancellation safe.
-    pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
-        let reasons = {
-            let g = self.reasons.lock().unwrap();
-
-            // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
-            // tests, we use everything. we should warn if the gc has been consecutively blocked
-            // for more than 1h (within single tenant session?).
-            BlockingReasons::clean_and_summarize(g)
-        };
-
-        if let Some(reasons) = reasons {
-            Err(reasons)
-        } else {
-            Ok(Guard {
-                _inner: self.blocking.lock().await,
-            })
-        }
-    }
-
-    pub(crate) fn summary(&self) -> Option<BlockingReasons> {
-        let g = self.reasons.lock().unwrap();
-
-        BlockingReasons::summarize(&g)
-    }
-
-    /// Start blocking gc for this one timeline for the given reason.
-    ///
-    /// This is not a guard based API but instead it mimics set API. The returned future will not
-    /// resolve until an existing gc round has completed.
-    ///
-    /// Returns true if this block was new, false if gc was already blocked for this reason.
-    ///
-    /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but the
-    /// caller will not have waited for the scheduled upload or for an ongoing gc iteration to end.
-    pub(crate) async fn insert(
-        &self,
-        timeline: &super::Timeline,
-        reason: GcBlockingReason,
-    ) -> anyhow::Result<bool> {
-        let (added, uploaded) = {
-            let mut g = self.reasons.lock().unwrap();
-            let set = g.entry(timeline.timeline_id).or_default();
-            let added = set.insert(reason);
-
-            // LOCK ORDER: intentionally hold the lock, see self.reasons.
-            let uploaded = timeline
-                .remote_client
-                .schedule_insert_gc_block_reason(reason)?;
-
-            (added, uploaded)
-        };
-
-        uploaded.await?;
-
-        // ensure that any ongoing gc iteration has completed
-        drop(self.blocking.lock().await);
-
-        Ok(added)
-    }
-
-    /// Remove blocking gc for this one timeline and the given reason.
-    pub(crate) async fn remove(
-        &self,
-        timeline: &super::Timeline,
-        reason: GcBlockingReason,
-    ) -> anyhow::Result<()> {
-        use std::collections::hash_map::Entry;
-
-        super::span::debug_assert_current_span_has_tenant_and_timeline_id();
-
-        let (remaining_blocks, uploaded) = {
-            let mut g = self.reasons.lock().unwrap();
-            match g.entry(timeline.timeline_id) {
-                Entry::Occupied(mut oe) => {
-                    let set = oe.get_mut();
-                    set.remove(reason);
-                    if set.is_empty() {
-                        oe.remove();
-                    }
-                }
-                Entry::Vacant(_) => {
-                    // we must still do the index_part.json update regardless, in case we had earlier
-                    // been cancelled
-                }
-            }
-
-            let remaining_blocks = g.len();
-
-            // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
-            let uploaded = timeline
-                .remote_client
-                .schedule_remove_gc_block_reason(reason)?;
-
-            (remaining_blocks, uploaded)
-        };
-        uploaded.await?;
-
-        // no need to synchronize with gc iteration again
-
-        if remaining_blocks > 0 {
-            tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
-        } else {
-            tracing::info!("gc is now unblocked for the tenant");
-        }
-
-        Ok(())
-    }
-
-    pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
-        let unblocked = {
-            let mut g = self.reasons.lock().unwrap();
-            if g.is_empty() {
-                return;
-            }
-
-            g.remove(&timeline.timeline_id);
-
-            BlockingReasons::clean_and_summarize(g).is_none()
-        };
-
-        if unblocked {
-            tracing::info!("gc is now unblocked following deletion");
-        }
-    }
-
-    /// Initialize with the non-deleted timelines of this tenant.
-    pub(crate) fn set_scanned(&self, scanned: Storage) {
-        let mut g = self.reasons.lock().unwrap();
-        assert!(g.is_empty());
-        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
-
-        if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
-            tracing::info!(summary=?reasons, "initialized with gc blocked");
-        }
-    }
-}
-
-pub(super) struct Guard<'a> {
-    _inner: tokio::sync::MutexGuard<'a, ()>,
-}
-
-#[derive(Debug)]
-pub(crate) struct BlockingReasons {
-    timelines: usize,
-    reasons: enumset::EnumSet<GcBlockingReason>,
-}
-
-impl std::fmt::Display for BlockingReasons {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{} timelines block for {:?}",
-            self.timelines, self.reasons
-        )
-    }
-}
-
-impl BlockingReasons {
-    fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        let mut reasons = enumset::EnumSet::empty();
-        g.retain(|_key, value| {
-            reasons = reasons.union(*value);
-            !value.is_empty()
-        });
-        if !g.is_empty() {
-            Some(BlockingReasons {
-                timelines: g.len(),
-                reasons,
-            })
-        } else {
-            None
-        }
-    }
-
-    fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        if g.is_empty() {
-            None
-        } else {
-            let reasons = g
-                .values()
-                .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
-            Some(BlockingReasons {
-                timelines: g.len(),
-                reasons,
-            })
-        }
-    }
-}
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,8 +51,7 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
-use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
+use pageserver_api::keyspace::KeySpaceAccum;
 use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
@@ -62,7 +61,7 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;

-use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
+use super::storage_layer::PersistentLayerDesc;

 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -846,8 +845,8 @@ impl LayerMap {
    }

    /// Return all L0 delta layers
-    pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
-        &self.l0_delta_layers
+    pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
+        self.l0_delta_layers.to_vec()
    }

    /// debugging function to print out the contents of the layer map
@@ -872,183 +871,11 @@ impl LayerMap {
        println!("End dump LayerMap");
        Ok(())
    }
-
-    /// `read_points` represent the tip of a timeline and any branch points, i.e. the places
-    /// where we expect to serve reads.
-    ///
-    /// This function sorts all layers (at least O(N log N)) and should be called infrequently.  The caller is
-    /// responsible for looking up and updating the Layer objects for these layer descriptors.
-    pub fn get_visibility(
-        &self,
-        mut read_points: Vec<Lsn>,
-    ) -> (
-        Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
-        KeySpace,
-    ) {
-        // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
-        // KeySpace is intended to be composed statically and iterated over.
-        struct KeyShadow {
-            // Map of range start to range end
-            inner: RangeSetBlaze<i128>,
-        }
-
-        impl KeyShadow {
-            fn new() -> Self {
-                Self {
-                    inner: Default::default(),
-                }
-            }
-
-            fn contains(&self, range: Range<Key>) -> bool {
-                let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
-                self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
-                    CheckSortedDisjoint::from([range_incl]),
-                ))
-            }
-
-            /// Add the input range to the keys covered by self.
-            ///
-            /// Return true if inserting this range covered some keys that were previously not covered
-            fn cover(&mut self, insert: Range<Key>) -> bool {
-                let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
-                self.inner.ranges_insert(range_incl)
-            }
-
-            fn reset(&mut self) {
-                self.inner = Default::default();
-            }
-
-            fn to_keyspace(&self) -> KeySpace {
-                let mut accum = KeySpaceAccum::new();
-                for range_incl in self.inner.ranges() {
-                    let range = Range {
-                        start: Key::from_i128(*range_incl.start()),
-                        end: Key::from_i128(range_incl.end() + 1),
-                    };
-                    accum.add_range(range)
-                }
-
-                accum.to_keyspace()
-            }
-        }
-
-        // The 'shadow' will be updated as we sweep through the layers: an image layer adds its key range to
-        // the shadow, and a ReadPoint resets the shadow to empty.
-        read_points.sort_by_key(|rp| rp.0);
-        let mut shadow = KeyShadow::new();
-
-        // We will interleave all our read points and layers into a sorted collection
-        enum Item {
-            ReadPoint { lsn: Lsn },
-            Layer(Arc<PersistentLayerDesc>),
-        }
-
-        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
-        items.extend(self.iter_historic_layers().map(Item::Layer));
-        items.extend(
-            read_points
-                .into_iter()
-                .map(|rp| Item::ReadPoint { lsn: rp }),
-        );
-
-        // Ordering: we want to iterate like this:
-        // 1. Highest LSNs first
-        // 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
-        // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
-        items.sort_by_key(|item| {
-            std::cmp::Reverse(match item {
-                Item::Layer(layer) => {
-                    if layer.is_delta() {
-                        (Lsn(layer.get_lsn_range().end.0 - 1), 0)
-                    } else {
-                        (layer.image_layer_lsn(), 1)
-                    }
-                }
-                Item::ReadPoint { lsn } => (*lsn, 2),
-            })
-        });
-
-        let mut results = Vec::with_capacity(self.historic.len());
-
-        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
-
-        for item in items {
-            let (reached_lsn, is_readpoint) = match &item {
-                Item::ReadPoint { lsn } => (lsn, true),
-                Item::Layer(layer) => (&layer.lsn_range.start, false),
-            };
-            maybe_covered_deltas.retain(|d| {
-                if *reached_lsn >= d.lsn_range.start && is_readpoint {
-                    // We encountered a readpoint within the delta layer: it is visible
-
-                    results.push((d.clone(), LayerVisibilityHint::Visible));
-                    false
-                } else if *reached_lsn < d.lsn_range.start {
-                    // We passed the layer's range without encountering a read point: it is not visible
-                    results.push((d.clone(), LayerVisibilityHint::Covered));
-                    false
-                } else {
-                    // We're still in the delta layer: continue iterating
-                    true
-                }
-            });
-
-            match item {
-                Item::ReadPoint { lsn: _lsn } => {
-                    // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
-                    // to assume that the whole key range is visible at the branch point.
-                    shadow.reset();
-                }
-                Item::Layer(layer) => {
-                    let visibility = if layer.is_delta() {
-                        if shadow.contains(layer.get_key_range()) {
-                            // If a layer isn't visible based on current state, we must defer deciding whether
-                            // it is truly not visible until we have advanced past the delta's range: we might
-                            // encounter another branch point within this delta layer's LSN range.
-                            maybe_covered_deltas.push(layer);
-                            continue;
-                        } else {
-                            LayerVisibilityHint::Visible
-                        }
-                    } else {
-                        let modified = shadow.cover(layer.get_key_range());
-                        if modified {
-                            // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
-                            LayerVisibilityHint::Visible
-                        } else {
-                            // An image layer in a region that was already covered
-                            LayerVisibilityHint::Covered
-                        }
-                    };
-
-                    results.push((layer, visibility));
-                }
-            }
-        }
-
-        // Drain any remaining maybe_covered deltas
-        results.extend(
-            maybe_covered_deltas
-                .into_iter()
-                .map(|d| (d, LayerVisibilityHint::Covered)),
-        );
-
-        (results, shadow.to_keyspace())
-    }
 }

 #[cfg(test)]
 mod tests {
-    use crate::tenant::{storage_layer::LayerName, IndexPart};
-    use pageserver_api::{
-        key::DBDIR_KEY,
-        keyspace::{KeySpace, KeySpaceRandomAccum},
-    };
-    use std::{collections::HashMap, path::PathBuf};
-    use utils::{
-        id::{TenantId, TimelineId},
-        shard::TenantShardId,
-    };
+    use pageserver_api::keyspace::KeySpace;

    use super::*;

@@ -1175,299 +1002,4 @@ mod tests {
            }
        }
    }
-
-    #[test]
-    fn layer_visibility_basic() {
-        // A simple synthetic input, as a smoke test.
-        let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
-        let timeline_id = TimelineId::generate();
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-
-        const FAKE_LAYER_SIZE: u64 = 1024;
-
-        let inject_delta = |updates: &mut BatchedUpdates,
-                            key_start: i128,
-                            key_end: i128,
-                            lsn_start: u64,
-                            lsn_end: u64| {
-            let desc = PersistentLayerDesc::new_delta(
-                tenant_shard_id,
-                timeline_id,
-                Range {
-                    start: Key::from_i128(key_start),
-                    end: Key::from_i128(key_end),
-                },
-                Range {
-                    start: Lsn(lsn_start),
-                    end: Lsn(lsn_end),
-                },
-                1024,
-            );
-            updates.insert_historic(desc.clone());
-            desc
-        };
-
-        let inject_image =
-            |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
-                let desc = PersistentLayerDesc::new_img(
-                    tenant_shard_id,
-                    timeline_id,
-                    Range {
-                        start: Key::from_i128(key_start),
-                        end: Key::from_i128(key_end),
-                    },
-                    Lsn(lsn),
-                    FAKE_LAYER_SIZE,
-                );
-                updates.insert_historic(desc.clone());
-                desc
-            };
-
-        //
-        // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
-        // we expect to handle.  You can follow these examples through in the same order as they would be processed
-        // by the function under test.
-        //
-
-        let mut read_points = vec![Lsn(1000)];
-
-        // A delta ahead of any image layer
-        let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
-
-        // An image layer is visible and covers some layers beneath itself
-        let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
-
-        // A delta layer covered by the image layer: should be covered
-        let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
-
-        // A delta layer partially covered by an image layer: should be visible
-        let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
-
-        // A delta layer not covered by an image layer: should be visible
-        let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
-
-        // An image layer covered by the image layer above: should be covered
-        let covered_image = inject_image(&mut updates, 10, 20, 89);
-
-        // An image layer partially covered by an image layer: should be visible
-        let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
-
-        // An image layer not covered by an image layer: should be visible
-        let not_covered_image = inject_image(&mut updates, 1, 4, 89);
-
-        // A read point: this will make subsequent layers below here visible, even if there are
-        // more recent layers covering them.
-        read_points.push(Lsn(80));
-
-        // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
-        let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
-
-        // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
-        // the read point should make it visible, even though its end LSN is covered
-        let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
-        let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
-        read_points.push(Lsn(65));
-        let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
-
-        let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
-
-        updates.flush();
-
-        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
-        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
-
-        assert_eq!(
-            layer_visibilities.get(&ahead_layer),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&visible_covering_img),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&partially_covered_delta),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&not_covered_delta),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_image),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&partially_covered_image),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&not_covered_image),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_below_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covering_img_between_read_points),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_between_read_points),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_intersects_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&visible_img_after_last_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-
-        // Shadow should include all the images below the last read point
-        let expected_shadow = KeySpace {
-            ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
-        };
-        assert_eq!(shadow, expected_shadow);
-    }
-
-    fn fixture_path(relative: &str) -> PathBuf {
-        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
-    }
-
-    #[test]
-    fn layer_visibility_realistic() {
-        // Load a large example layermap
-        let index_raw = std::fs::read_to_string(fixture_path(
-            "test_data/indices/mixed_workload/index_part.json",
-        ))
-        .unwrap();
-        let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
-
-        let tenant_id = TenantId::generate();
-        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-        let timeline_id = TimelineId::generate();
-
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-        for (layer_name, layer_metadata) in index.layer_metadata {
-            let layer_desc = match layer_name {
-                LayerName::Image(layer_name) => PersistentLayerDesc {
-                    key_range: layer_name.key_range.clone(),
-                    lsn_range: layer_name.lsn_as_range(),
-                    tenant_shard_id,
-                    timeline_id,
-                    is_delta: false,
-                    file_size: layer_metadata.file_size,
-                },
-                LayerName::Delta(layer_name) => PersistentLayerDesc {
-                    key_range: layer_name.key_range,
-                    lsn_range: layer_name.lsn_range,
-                    tenant_shard_id,
-                    timeline_id,
-                    is_delta: true,
-                    file_size: layer_metadata.file_size,
-                },
-            };
-            updates.insert_historic(layer_desc);
-        }
-        updates.flush();
-
-        let read_points = vec![index.metadata.disk_consistent_lsn()];
-        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
-        for (layer_desc, visibility) in &layer_visibilities {
-            tracing::info!("{layer_desc:?}: {visibility:?}");
-            eprintln!("{layer_desc:?}: {visibility:?}");
-        }
-
-        // The shadow should be non-empty, since there were some image layers
-        assert!(!shadow.ranges.is_empty());
-
-        // At least some layers should be marked covered
-        assert!(layer_visibilities
-            .iter()
-            .any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
-
-        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
-
-        // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
-        for (layer_desc, visible) in &layer_visibilities {
-            let mut coverage = KeySpaceRandomAccum::new();
-            let mut covered_by = Vec::new();
-
-            for other_layer in layer_map.iter_historic_layers() {
-                if &other_layer == layer_desc {
-                    continue;
-                }
-                if !other_layer.is_delta()
-                    && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
-                    && other_layer.key_range.start <= layer_desc.key_range.end
-                    && layer_desc.key_range.start <= other_layer.key_range.end
-                {
-                    coverage.add_range(other_layer.get_key_range());
-                    covered_by.push((*other_layer).clone());
-                }
-            }
-            let coverage = coverage.to_keyspace();
-
-            let expect_visible = if coverage.ranges.len() == 1
-                && coverage.contains(&layer_desc.key_range.start)
-                && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
-            {
-                LayerVisibilityHint::Covered
-            } else {
-                LayerVisibilityHint::Visible
-            };
-
-            if expect_visible != *visible {
-                eprintln!(
-                    "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
-                    layer_desc.key_range.start,
-                    layer_desc.key_range.end,
-                    layer_desc.lsn_range.start,
-                    layer_desc.lsn_range.end,
-                    layer_desc.is_delta()
-                );
-                if expect_visible == LayerVisibilityHint::Covered {
-                    eprintln!("Covered by:");
-                    for other in covered_by {
-                        eprintln!(
-                            "  {}..{} @ {}",
-                            other.get_key_range().start,
-                            other.get_key_range().end,
-                            other.image_layer_lsn()
-                        );
-                    }
-                    if let Some(range) = coverage.ranges.first() {
-                        eprintln!(
-                            "Total coverage from contributing layers: {}..{}",
-                            range.start, range.end
-                        );
-                    } else {
-                        eprintln!(
-                            "Total coverage from contributing layers: {:?}",
-                            coverage.ranges
-                        );
-                    }
-                }
-            }
-            assert_eq!(expect_visible, *visible);
-        }
-
-        // Sanity: the layer that holds latest data for the DBDIR key should always be visible
-        // (just using this key as a key that will always exist for any layermap fixture)
-        let dbdir_layer = layer_map
-            .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
-            .unwrap();
-        assert!(matches!(
-            layer_visibilities.get(&dbdir_layer.layer).unwrap(),
-            LayerVisibilityHint::Visible
-        ));
-    }
 }
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -521,10 +521,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {

        Ok(&self.historic_coverage)
    }
-
-    pub(crate) fn len(&self) -> usize {
-        self.layers.len()
-    }
 }

 #[test]
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -111,7 +111,7 @@ impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
 #[error("re-serializing for crc32 failed")]
 struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);

-const METADATA_HDR_SIZE: usize = size_of::<TimelineMetadataHeader>();
+const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 struct TimelineMetadataBodyV2 {
@@ -285,21 +285,30 @@ impl TimelineMetadata {
    }

    /// When reparenting, the `ancestor_lsn` does not change.
-    pub fn reparent(&mut self, timeline: &TimelineId) {
+    ///
+    /// Returns true if anything was changed.
+    pub fn reparent(&mut self, timeline: &TimelineId) -> bool {
        assert!(self.body.ancestor_timeline.is_some());
        // no assertion for redoing this: it's fine, we may have to repeat this multiple times over
+        let prev = self.body.ancestor_timeline;
        self.body.ancestor_timeline = Some(*timeline);
+        prev.as_ref() != Some(timeline)
    }

-    pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
+    /// Returns true if anything was changed
+    pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
+        let mut changed = false;
        if let Some(ancestor) = self.body.ancestor_timeline {
            assert_eq!(ancestor, branchpoint.0);
+            changed = true;
        }
        if self.body.ancestor_lsn != Lsn(0) {
            assert_eq!(self.body.ancestor_lsn, branchpoint.1);
+            changed = true;
        }
        self.body.ancestor_timeline = None;
        self.body.ancestor_lsn = Lsn(0);
+        changed
    }

    pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -54,8 +54,8 @@ use utils::id::{TenantId, TimelineId};

 use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
-use super::timeline::detach_ancestor::PreparedTimelineDetach;
-use super::{GlobalShutDown, TenantSharedResources};
+use super::timeline::detach_ancestor::{self, PreparedTimelineDetach};
+use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is eligible to service
@@ -116,6 +116,8 @@ pub(crate) enum ShardSelector {
    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
    /// ignore it.
    Zero,
+    /// Pick the first shard we find for the TenantId
+    First,
    /// Pick the shard that holds this key
    Page(Key),
    /// The shard ID is known: pick the given shard
@@ -224,8 +226,21 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 }

 /// See [`Self::spawn`].
-#[derive(Clone, Default)]
-pub struct BackgroundPurges(tokio_util::task::TaskTracker);
+#[derive(Clone)]
+pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
+enum BackgroundPurgesInner {
+    Open(tokio::task::JoinSet<()>),
+    // we use the async mutex for coalescing
+    ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
+}
+
+impl Default for BackgroundPurges {
+    fn default() -> Self {
+        Self(Arc::new(std::sync::Mutex::new(
+            BackgroundPurgesInner::Open(JoinSet::new()),
+        )))
+    }
+}

 impl BackgroundPurges {
    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
@@ -234,32 +249,24 @@ impl BackgroundPurges {
    /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
    /// Thus the [`BackgroundPurges`] type to keep track of these tasks.
    pub fn spawn(&self, tmp_path: Utf8PathBuf) {
-        // because on shutdown we close and wait, we are misusing TaskTracker a bit.
-        //
-        // so first acquire a token, then check if the tracker has been closed. the tracker might get closed
-        // right after, but at least the shutdown will wait for what we are spawning next.
-        let token = self.0.token();
-
-        if self.0.is_closed() {
-            warn!(
-                %tmp_path,
-                "trying to spawn background purge during shutdown, ignoring"
-            );
-            return;
-        }
-
-        let span = info_span!(parent: None, "background_purge", %tmp_path);
-
-        let task = move || {
-            let _token = token;
-            let _entered = span.entered();
-            if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) {
-                // should we fatal_io_error here?
-                warn!(%error, "failed to purge tenant directory");
+        let mut guard = self.0.lock().unwrap();
+        let jset = match &mut *guard {
+            BackgroundPurgesInner::Open(ref mut jset) => jset,
+            BackgroundPurgesInner::ShuttingDown(_) => {
+                warn!("trying to spawn background purge during shutdown, ignoring");
+                return;
            }
        };
-
-        BACKGROUND_RUNTIME.spawn_blocking(task);
+        jset.spawn_on(
+            async move {
+                if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
+                    // should we fatal_io_error here?
+                    warn!(%error, path=%tmp_path, "failed to purge tenant directory");
+                }
+            }
+            .instrument(info_span!(parent: None, "background_purge")),
+            BACKGROUND_RUNTIME.handle(),
+        );
    }

    /// When this future completes, all background purges have completed.
@@ -273,9 +280,42 @@ impl BackgroundPurges {
    /// instances of this future will continue to be correct.
    #[instrument(skip_all)]
    pub async fn shutdown(&self) {
-        // forbid new tasks (can be called many times)
-        self.0.close();
-        self.0.wait().await;
+        let jset = {
+            let mut guard = self.0.lock().unwrap();
+            match &mut *guard {
+                BackgroundPurgesInner::Open(jset) => {
+                    *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
+                        std::mem::take(jset),
+                    )))
+                }
+                BackgroundPurgesInner::ShuttingDown(_) => {
+                    // calling shutdown multiple times is most likely a bug in pageserver shutdown code
+                    warn!("already shutting down");
+                }
+            };
+            match &mut *guard {
+                BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
+                BackgroundPurgesInner::Open(_) => {
+                    unreachable!("above code transitions into shut down state");
+                }
+            }
+        };
+        let mut jset = jset.lock().await; // concurrent callers coalesce here
+        while let Some(res) = jset.join_next().await {
+            match res {
+                Ok(()) => {}
+                Err(e) if e.is_panic() => {
+                    // If it panicked, the error is already logged by the panic hook.
+                }
+                Err(e) if e.is_cancelled() => {
+                    unreachable!("we don't cancel the joinset or runtime")
+                }
+                Err(e) => {
+                    // No idea when this can happen, but let's log it.
+                    warn!(%e, "background purge task failed or panicked");
+                }
+            }
+        }
    }
 }

@@ -627,20 +667,18 @@ pub async fn init_tenant_mgr(
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
-            LocationMode::Attached(attached_conf) => TenantSlot::Attached(
-                tenant_spawn(
-                    conf,
-                    tenant_shard_id,
-                    &tenant_dir_path,
-                    resources.clone(),
-                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-                    shard_identity,
-                    Some(init_order.clone()),
-                    SpawnMode::Lazy,
-                    &ctx,
-                )
-                .expect("global shutdown during init_tenant_mgr cannot happen"),
-            ),
+            LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
+                conf,
+                tenant_shard_id,
+                &tenant_dir_path,
+                resources.clone(),
+                AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                shard_identity,
+                Some(init_order.clone()),
+                SpawnMode::Lazy,
+                None,
+                &ctx,
+            )),
            LocationMode::Secondary(secondary_conf) => {
                info!(
                    tenant_id = %tenant_shard_id.tenant_id,
@@ -687,8 +725,9 @@ fn tenant_spawn(
    shard_identity: ShardIdentity,
    init_order: Option<InitializationOrder>,
    mode: SpawnMode,
+    existing_detach_attempt: Option<&detach_ancestor::Attempt>,
    ctx: &RequestContext,
-) -> Result<Arc<Tenant>, GlobalShutDown> {
+) -> Arc<Tenant> {
    // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
    // path, and contains a configuration file.  Assertions that do synchronous I/O are limited to debug mode
    // to avoid impacting prod runtime performance.
@@ -707,6 +746,7 @@ fn tenant_spawn(
        shard_identity,
        init_order,
        mode,
+        existing_detach_attempt,
        ctx,
    )
 }
@@ -1154,11 +1194,9 @@ impl TenantManager {
                    shard_identity,
                    None,
                    spawn_mode,
+                    None,
                    ctx,
-                )
-                .map_err(|_: GlobalShutDown| {
-                    UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
-                })?;
+                );

                TenantSlot::Attached(tenant)
            }
@@ -1278,8 +1316,9 @@ impl TenantManager {
            shard_identity,
            None,
            SpawnMode::Eager,
+            None,
            ctx,
-        )?;
+        );

        slot_guard.upsert(TenantSlot::Attached(tenant))?;

@@ -1350,32 +1389,34 @@ impl TenantManager {
        tenant_shard_id: TenantShardId,
    ) -> Result<(), DeleteTenantError> {
        let remote_path = remote_tenant_path(&tenant_shard_id);
-        let mut keys_stream = self.resources.remote_storage.list_streaming(
-            Some(&remote_path),
-            remote_storage::ListingMode::NoDelimiter,
-            None,
-            &self.cancel,
-        );
-        while let Some(chunk) = keys_stream.next().await {
-            let keys = match chunk {
-                Ok(listing) => listing.keys,
-                Err(remote_storage::DownloadError::Cancelled) => {
-                    return Err(DeleteTenantError::Cancelled)
-                }
-                Err(remote_storage::DownloadError::NotFound) => return Ok(()),
-                Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
-            };
-
-            if keys.is_empty() {
-                tracing::info!("Remote storage already deleted");
-            } else {
-                tracing::info!("Deleting {} keys from remote storage", keys.len());
-                let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
-                self.resources
-                    .remote_storage
-                    .delete_objects(&keys, &self.cancel)
-                    .await?;
+        let keys = match self
+            .resources
+            .remote_storage
+            .list(
+                Some(&remote_path),
+                remote_storage::ListingMode::NoDelimiter,
+                None,
+                &self.cancel,
+            )
+            .await
+        {
+            Ok(listing) => listing.keys,
+            Err(remote_storage::DownloadError::Cancelled) => {
+                return Err(DeleteTenantError::Cancelled)
            }
+            Err(remote_storage::DownloadError::NotFound) => return Ok(()),
+            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
+        };
+
+        if keys.is_empty() {
+            tracing::info!("Remote storage already deleted");
+        } else {
+            tracing::info!("Deleting {} keys from remote storage", keys.len());
+            let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
+            self.resources
+                .remote_storage
+                .delete_objects(&keys, &self.cancel)
+                .await?;
        }

        Ok(())
@@ -1729,9 +1770,14 @@ impl TenantManager {
            let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
            for timeline in timelines.values() {
                tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
-                let layers = timeline.layers.read().await;
+                let timeline_layers = timeline
+                    .layers
+                    .read()
+                    .await
+                    .likely_resident_layers()
+                    .collect::<Vec<_>>();

-                for layer in layers.likely_resident_layers() {
+                for layer in timeline_layers {
                    let relative_path = layer
                        .local_path()
                        .strip_prefix(&parent_path)
@@ -1927,6 +1973,7 @@ impl TenantManager {
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        prepared: PreparedTimelineDetach,
+        mut attempt: detach_ancestor::Attempt,
        ctx: &RequestContext,
    ) -> Result<HashSet<TimelineId>, anyhow::Error> {
        // FIXME: this is unnecessary, slotguard already has these semantics
@@ -1977,43 +2024,66 @@ impl TenantManager {

        let timeline = tenant.get_timeline(timeline_id, true)?;

-        let reparented = timeline
-            .complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
+        let resp = timeline
+            .detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
            .await?;

        let mut slot_guard = slot_guard.into_inner();

-        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, ShutdownMode::Hard).await {
-            Ok(()) => {
-                slot_guard.drop_old_value()?;
-            }
-            Err(_barrier) => {
-                slot_guard.revert();
-                // this really should not happen, at all, unless shutdown was already going?
-                anyhow::bail!("Cannot restart Tenant, already shutting down");
+        let tenant = if resp.reset_tenant_required() {
+            attempt.before_shutdown();
+
+            let (_guard, progress) = utils::completion::channel();
+            match tenant.shutdown(progress, ShutdownMode::Hard).await {
+                Ok(()) => {
+                    slot_guard.drop_old_value()?;
+                }
+                Err(_barrier) => {
+                    slot_guard.revert();
+                    // this really should not happen, at all, unless shutdown was already going?
+                    anyhow::bail!("Cannot restart Tenant, already shutting down");
+                }
            }
+
+            let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+            let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+
+            let shard_identity = config.shard;
+            let tenant = tenant_spawn(
+                self.conf,
+                tenant_shard_id,
+                &tenant_path,
+                self.resources.clone(),
+                AttachedTenantConf::try_from(config)?,
+                shard_identity,
+                None,
+                SpawnMode::Eager,
+                Some(&attempt),
+                ctx,
+            );
+
+            slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
+            tenant
+        } else {
+            tracing::info!("skipping tenant_reset as no changes made required it");
+            tenant
+        };
+
+        if let Some(reparented) = resp.completed() {
+            // finally ask the restarted tenant to complete the detach
+            tenant
+                .ongoing_timeline_detach
+                .complete(attempt, &tenant)
+                .await?;
+            Ok(reparented)
+        } else {
+            // at least the latest versions have now been downloaded and refreshed; be ready to
+            // retry another time.
+            tenant.ongoing_timeline_detach.cancel(attempt);
+            Err(anyhow::anyhow!(
+                "failed to reparent all candidate timelines, please retry"
+            ))
        }
-
-        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
-
-        let shard_identity = config.shard;
-        let tenant = tenant_spawn(
-            self.conf,
-            tenant_shard_id,
-            &tenant_path,
-            self.resources.clone(),
-            AttachedTenantConf::try_from(config)?,
-            shard_identity,
-            None,
-            SpawnMode::Eager,
-            ctx,
-        )?;
-
-        slot_guard.upsert(TenantSlot::Attached(tenant))?;
-
-        Ok(reparented)
    }

    /// A page service client sends a TenantId, and to look up the correct Tenant we must
@@ -2050,6 +2120,7 @@ impl TenantManager {
                    };

                    match selector {
+                        ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                            return ShardResolveResult::Found(tenant.clone())
                        }
@@ -2131,9 +2202,6 @@ pub(crate) enum GetActiveTenantError {
    /// never happen.
    #[error("Tenant is broken: {0}")]
    Broken(String),
-
-    #[error("reconnect to switch tenant id")]
-    SwitchedTenant,
 }

 #[derive(Debug, thiserror::Error)]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -187,7 +187,7 @@ use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};

 pub(crate) use download::download_initdb_tar_zst;
-use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState};
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
@@ -457,17 +457,6 @@ impl RemoteTimelineClient {
            .unwrap_or(false)
    }

-    /// Returns whether the timeline is archived.
-    /// Return None if the remote index_part hasn't been downloaded yet.
-    pub(crate) fn is_archived(&self) -> Option<bool> {
-        self.upload_queue
-            .lock()
-            .unwrap()
-            .initialized_mut()
-            .map(|q| q.clean.0.archived_at.is_some())
-            .ok()
-    }
-
    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
            current_remote_index_part
@@ -628,7 +617,7 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated.
+    /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
    pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
        self: &Arc<Self>,
        last_aux_file_policy: Option<AuxFilePolicy>,
@@ -639,48 +628,6 @@ impl RemoteTimelineClient {
        self.schedule_index_upload(upload_queue)?;
        Ok(())
    }
-
-    /// Launch an index-file upload operation in the background, with only the `archived_at` field updated.
-    ///
-    /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded,
-    /// so either if the change is already sitting in the queue, but not commited yet, or the change has not
-    /// been in the queue yet.
-    pub(crate) fn schedule_index_upload_for_timeline_archival_state(
-        self: &Arc<Self>,
-        state: TimelineArchivalState,
-    ) -> anyhow::Result<bool> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-
-        /// Returns Some(_) if a change is needed, and Some(true) if it's a
-        /// change needed to set archived_at.
-        fn need_change(
-            archived_at: &Option<NaiveDateTime>,
-            state: TimelineArchivalState,
-        ) -> Option<bool> {
-            match (archived_at, state) {
-                (Some(_), TimelineArchivalState::Archived)
-                | (None, TimelineArchivalState::Unarchived) => {
-                    // Nothing to do
-                    tracing::info!("intended state matches present state");
-                    None
-                }
-                (None, TimelineArchivalState::Archived) => Some(true),
-                (Some(_), TimelineArchivalState::Unarchived) => Some(false),
-            }
-        }
-        let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state);
-
-        if let Some(archived_at_set) = need_upload_scheduled {
-            let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc());
-            upload_queue.dirty.archived_at = intended_archived_at;
-            self.schedule_index_upload(upload_queue)?;
-        }
-
-        let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some();
-        Ok(need_wait)
-    }
-
    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///
@@ -736,12 +683,13 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Reparent this timeline to a new parent.
+    ///
+    /// A retryable step of timeline ancestor detach.
    pub(crate) async fn schedule_reparenting_and_wait(
        self: &Arc<Self>,
        new_parent: &TimelineId,
    ) -> anyhow::Result<()> {
-        // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
-        // and reads the in-memory part we cannot do the detaching like this
        let receiver = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
@@ -752,17 +700,29 @@ impl RemoteTimelineClient {
                ));
            };

-            upload_queue.dirty.metadata.reparent(new_parent);
-            upload_queue.dirty.lineage.record_previous_ancestor(&prev);
+            let uploaded = &upload_queue.clean.0.metadata;

-            self.schedule_index_upload(upload_queue)?;
+            if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() {
+                // nothing to do
+                None
+            } else {
+                let mut modified = false;

-            self.schedule_barrier0(upload_queue)
+                modified |= upload_queue.dirty.metadata.reparent(new_parent);
+                modified |= upload_queue.dirty.lineage.record_previous_ancestor(&prev);
+
+                if modified {
+                    self.schedule_index_upload(upload_queue)?;
+                }
+
+                Some(self.schedule_barrier0(upload_queue))
+            }
        };

-        Self::wait_completion0(receiver)
-            .await
-            .context("wait completion")
+        if let Some(receiver) = receiver {
+            Self::wait_completion0(receiver).await?;
+        }
+        Ok(())
    }

    /// Schedules uploading a new version of `index_part.json` with the given layers added,
@@ -778,115 +738,99 @@ impl RemoteTimelineClient {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
-            upload_queue.dirty.lineage.record_detaching(&adopted);
+            if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) {
+                None
+            } else {
+                let mut modified = false;
+                modified |= upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
+                modified |= upload_queue.dirty.lineage.record_detaching(&adopted);

-            for layer in layers {
-                upload_queue
-                    .dirty
-                    .layer_metadata
-                    .insert(layer.layer_desc().layer_name(), layer.metadata());
+                for layer in layers {
+                    let prev = upload_queue
+                        .dirty
+                        .layer_metadata
+                        .insert(layer.layer_desc().layer_name(), layer.metadata());
+                    modified |= prev.is_none();
+                }
+
+                if modified {
+                    self.schedule_index_upload(upload_queue)?;
+                }
+
+                Some(self.schedule_barrier0(upload_queue))
            }
-
-            self.schedule_index_upload(upload_queue)?;
-
-            let barrier = self.schedule_barrier0(upload_queue);
-            self.launch_queued_tasks(upload_queue);
-            barrier
        };

-        Self::wait_completion0(barrier)
-            .await
-            .context("wait completion")
+        if let Some(barrier) = barrier {
+            Self::wait_completion0(barrier).await?;
+        }
+        Ok(())
    }

-    /// Adds a gc blocking reason for this timeline if one does not exist already.
+    /// Marks timeline detach ancestor started for this timeline if it has not been marked as
+    /// started.
    ///
-    /// A retryable step of timeline detach ancestor.
+    /// A retryable step of timeline detach ancestor.
    ///
-    /// Returns a future which waits until the completion of the upload.
-    pub(crate) fn schedule_insert_gc_block_reason(
+    /// Does not overwrite or even error if the set of reparentable timelines differs. Those can
+    /// be inspected later.
+    ///
+    /// Waits until the completion of the upload.
+    pub(crate) async fn schedule_started_detach_ancestor_mark_and_wait(
        self: &Arc<Self>,
-        reason: index::GcBlockingReason,
-    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
-    {
+    ) -> anyhow::Result<()> {
        let maybe_barrier = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            if let index::GcBlockingReason::DetachAncestor = reason {
-                if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
-                    drop(guard);
-                    panic!("cannot start detach ancestor if there is nothing to detach from");
-                }
+            fn wanted(x: Option<&index::GcBlocking>) -> bool {
+                x.is_some_and(|b| b.blocked_by_detach_ancestor())
            }

-            let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
-
            let current = upload_queue.dirty.gc_blocking.as_ref();
            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();

            match (current, uploaded) {
                (x, y) if wanted(x) && wanted(y) => None,
                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
-                // Usual case: !wanted(x) && !wanted(y)
-                //
-                // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
-                // turn on and off some reason.
-                (x, y) => {
-                    if !wanted(x) && wanted(y) {
-                        // this could be avoided by having external in-memory synchronization, like
-                        // timeline detach ancestor
-                        warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
-                    }
-
+                _ => {
                    // at this point, the metadata must always show that there is a parent
+                    if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
+                        panic!("cannot start detach ancestor if there is nothing to detach from");
+                    }
                    upload_queue.dirty.gc_blocking = current
-                        .map(|x| x.with_reason(reason))
-                        .or_else(|| Some(index::GcBlocking::started_now_for(reason)));
+                        .map(|x| x.with_detach_ancestor())
+                        .or_else(|| Some(index::GcBlocking::started_now_for_detach_ancestor()));
                    self.schedule_index_upload(upload_queue)?;
                    Some(self.schedule_barrier0(upload_queue))
                }
            }
        };

-        Ok(async move {
-            if let Some(barrier) = maybe_barrier {
-                Self::wait_completion0(barrier).await?;
-            }
-            Ok(())
-        })
+        if let Some(barrier) = maybe_barrier {
+            Self::wait_completion0(barrier).await?;
+        }
+        Ok(())
    }

-    /// Removes a gc blocking reason for this timeline if one exists.
+    /// Marks timeline detach ancestor completed for this timeline if it has not been marked as
+    /// such already.
    ///
-    /// A retryable step of timeline detach ancestor.
+    /// ## Panics
    ///
-    /// Returns a future which waits until the completion of the upload.
-    pub(crate) fn schedule_remove_gc_block_reason(
+    /// If the timeline has not been detached from ancestor already.
+    pub(crate) async fn schedule_completed_detach_ancestor_mark_and_wait(
        self: &Arc<Self>,
-        reason: index::GcBlockingReason,
-    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
-    {
+    ) -> anyhow::Result<()> {
        let maybe_barrier = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            if let index::GcBlockingReason::DetachAncestor = reason {
-                if !upload_queue
-                    .clean
-                    .0
-                    .lineage
-                    .is_detached_from_original_ancestor()
-                {
-                    drop(guard);
-                    panic!("cannot complete timeline_ancestor_detach while not detached");
-                }
-            }
+            assert!(upload_queue.clean.0.lineage.is_detached_from_ancestor());

-            let wanted = |x: Option<&index::GcBlocking>| {
-                x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
-            };
+            fn wanted(x: Option<&index::GcBlocking>) -> bool {
+                x.is_none() || x.is_some_and(|b| !b.blocked_by_detach_ancestor())
+            }

            let current = upload_queue.dirty.gc_blocking.as_ref();
            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
@@ -894,27 +838,21 @@ impl RemoteTimelineClient {
            match (current, uploaded) {
                (x, y) if wanted(x) && wanted(y) => None,
                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
-                (x, y) => {
-                    if !wanted(x) && wanted(y) {
-                        warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
-                    }
-
-                    upload_queue.dirty.gc_blocking =
-                        current.as_ref().and_then(|x| x.without_reason(reason));
+                _ => {
+                    upload_queue.dirty.gc_blocking = current
+                        .expect("has to be Some because of wanted()")
+                        .without_detach_ancestor();
                    assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
-                    // FIXME: bogus ?
                    self.schedule_index_upload(upload_queue)?;
                    Some(self.schedule_barrier0(upload_queue))
                }
            }
        };

-        Ok(async move {
-            if let Some(barrier) = maybe_barrier {
-                Self::wait_completion0(barrier).await?;
-            }
-            Ok(())
-        })
+        if let Some(barrier) = maybe_barrier {
+            Self::wait_completion0(barrier).await?;
+        }
+        Ok(())
    }

    /// Launch an upload operation in the background; the file is added to be included in next
@@ -1495,18 +1433,6 @@ impl RemoteTimelineClient {
                .dirty
                .layer_metadata
                .drain()
-                .filter(|(_file_name, meta)| {
-                    // Filter out layers that belonged to an ancestor shard.  Since we are deleting the whole timeline from
-                    // all shards anyway, we _could_ delete these, but
-                    // - it creates a potential race if other shards are still
-                    //   using the layers while this shard deletes them.
-                    // - it means that if we rolled back the shard split, the ancestor shards would be in a state where
-                    //   these timelines are present but corrupt (their index exists but some layers don't)
-                    //
-                    // These layers will eventually be cleaned up by the scrubber when it does physical GC.
-                    meta.shard.shard_number == self.tenant_shard_id.shard_number
-                        && meta.shard.shard_count == self.tenant_shard_id.shard_count
-                })
                .map(|(file_name, meta)| {
                    remote_layer_path(
                        &self.tenant_shard_id.tenant_id,
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -32,10 +32,6 @@ pub struct IndexPart {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deleted_at: Option<NaiveDateTime>,

-    #[serde(default)]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub archived_at: Option<NaiveDateTime>,
-
    /// Per layer file name metadata, which can be present for a present or missing layer file.
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -87,12 +83,11 @@ impl IndexPart {
    /// - 5: lineage was added
    /// - 6: last_aux_file_policy is added.
    /// - 7: metadata_bytes is no longer written, but still read
-    /// - 8: added `archived_at`
-    /// - 9: +gc_blocking
-    const LATEST_VERSION: usize = 9;
+    /// - 8: +gc_blocking
+    const LATEST_VERSION: usize = 8;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];

    pub const FILE_NAME: &'static str = "index_part.json";

@@ -103,7 +98,6 @@ impl IndexPart {
            disk_consistent_lsn: metadata.disk_consistent_lsn(),
            metadata,
            deleted_at: None,
-            archived_at: None,
            lineage: Default::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
@@ -216,26 +210,45 @@ fn is_false(b: &bool) -> bool {
 impl Lineage {
    const REMEMBER_AT_MOST: usize = 100;

-    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
+    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool {
        if self.reparenting_history.last() == Some(old_ancestor) {
            // do not re-record it
-            return;
-        }
+            false
+        } else {
+            #[cfg(feature = "testing")]
+            {
+                let existing = self
+                    .reparenting_history
+                    .iter()
+                    .position(|x| x == old_ancestor);
+                assert_eq!(
+                    existing, None,
+                    "we cannot reparent onto and off and onto the same timeline twice"
+                );
+            }
+            let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;

-        let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
-
-        self.reparenting_history_truncated |= drop_oldest;
-        if drop_oldest {
-            self.reparenting_history.remove(0);
+            self.reparenting_history_truncated |= drop_oldest;
+            if drop_oldest {
+                self.reparenting_history.remove(0);
+            }
+            self.reparenting_history.push(*old_ancestor);
+            true
        }
-        self.reparenting_history.push(*old_ancestor);
    }

-    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
-        assert!(self.original_ancestor.is_none());
-
-        self.original_ancestor =
-            Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
+    /// Returns true if anything changed.
+    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
+        if let Some((id, lsn, _)) = self.original_ancestor {
+            assert_eq!(id, branchpoint.0);
+            assert_eq!(lsn, branchpoint.1);
+            false
+        } else {
+            assert!(self.original_ancestor.is_none());
+            self.original_ancestor =
+                Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
+            true
+        }
    }

    /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
@@ -247,70 +260,50 @@ impl Lineage {
            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
    }

-    pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
+    /// Returns true if the timeline originally had an ancestor, and no longer has one.
+    pub(crate) fn is_detached_from_ancestor(&self) -> bool {
        self.original_ancestor.is_some()
    }

+    /// Returns original ancestor timeline id and lsn that this timeline has been detached from.
+    pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> {
+        self.original_ancestor.map(|(id, lsn, _)| (id, lsn))
+    }
+
    pub(crate) fn is_reparented(&self) -> bool {
        !self.reparenting_history.is_empty()
    }
 }

+/// Right now, the only reason to block gc persistently is detach_ancestor. To use gc blocking more
+/// broadly, a reason set field needs to be added, and building the shared state at load time must
+/// become more involved so that detach_ancestor does not clear a manually configured gc blocking.
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub(crate) struct GcBlocking {
    pub(crate) started_at: NaiveDateTime,
-    pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
-}
-
-#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
-#[enumset(serialize_repr = "list")]
-pub(crate) enum GcBlockingReason {
-    Manual,
-    DetachAncestor,
 }

 impl GcBlocking {
-    pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
+    pub(super) fn started_now_for_detach_ancestor() -> Self {
        GcBlocking {
            started_at: chrono::Utc::now().naive_utc(),
-            reasons: enumset::EnumSet::only(reason),
        }
    }

-    /// Returns true if the given reason is one of the reasons why the gc is blocked.
-    pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
-        self.reasons.contains(reason)
+    /// Returns true if detach_ancestor is one of the reasons why the gc is blocked.
+    pub(crate) fn blocked_by_detach_ancestor(&self) -> bool {
+        true
    }

-    /// Returns a version of self with the given reason.
-    pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
-        assert!(!self.blocked_by(reason));
-        let mut reasons = self.reasons;
-        reasons.insert(reason);
-
-        Self {
-            started_at: self.started_at,
-            reasons,
-        }
+    /// Returns a version of self with the reason of detach_ancestor.
+    pub(super) fn with_detach_ancestor(&self) -> Self {
+        self.clone()
    }

-    /// Returns a version of self without the given reason. Assumption is that if
-    /// there are no more reasons, we can unblock the gc by returning `None`.
-    pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
-        assert!(self.blocked_by(reason));
-
-        if self.reasons.len() == 1 {
-            None
-        } else {
-            let mut reasons = self.reasons;
-            assert!(reasons.remove(reason));
-            assert!(!reasons.is_empty());
-
-            Some(Self {
-                started_at: self.started_at,
-                reasons,
-            })
-        }
+    /// Returns a version of self without the reason of detach_ancestor. Assumption is that if
+    /// there are no more reasons, we can unblock the gc.
+    pub(super) fn without_detach_ancestor(&self) -> Option<Self> {
+        None
    }
 }

@@ -353,7 +346,6 @@ mod tests {
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
-            archived_at: None,
            lineage: Lineage::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
@@ -397,7 +389,6 @@ mod tests {
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
-            archived_at: None,
            lineage: Lineage::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
@@ -442,7 +433,6 @@ mod tests {
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            archived_at: None,
            lineage: Lineage::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
@@ -490,7 +480,6 @@ mod tests {
            ])
            .unwrap(),
            deleted_at: None,
-            archived_at: None,
            lineage: Lineage::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
@@ -533,7 +522,6 @@ mod tests {
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            archived_at: None,
            lineage: Lineage::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
@@ -575,7 +563,6 @@ mod tests {
            disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
            metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
-            archived_at: None,
            lineage: Lineage {
                reparenting_history_truncated: false,
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
@@ -626,7 +613,6 @@ mod tests {
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            archived_at: None,
            lineage: Lineage {
                reparenting_history_truncated: false,
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
@@ -686,7 +672,6 @@ mod tests {
                14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            archived_at: None,
            lineage: Default::default(),
            gc_blocking: None,
            last_aux_file_policy: Default::default(),
@@ -714,8 +699,9 @@ mod tests {
                "initdb_lsn": "0/1696070",
                "pg_version": 14
            },
-            "deleted_at": "2023-07-31T09:00:00.123",
-            "archived_at": "2023-04-29T09:00:00.123"
+            "gc_blocking": {
+                "started_at": "2024-07-19T09:00:00.123"
+            }
        }"#;

        let expected = IndexPart {
@@ -742,73 +728,12 @@ mod tests {
                Lsn::from_str("0/1696070").unwrap(),
                14,
            ).with_recalculated_checksum().unwrap(),
-            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
-            lineage: Default::default(),
-            gc_blocking: None,
-            last_aux_file_policy: Default::default(),
-        };
-
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
-        assert_eq!(part, expected);
-    }
-
-    #[test]
-    fn v9_indexpart_is_parsed() {
-        let example = r#"{
-            "version": 9,
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata": {
-                "disk_consistent_lsn": "0/16960E8",
-                "prev_record_lsn": "0/1696070",
-                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
-                "ancestor_lsn": "0/0",
-                "latest_gc_cutoff_lsn": "0/1696070",
-                "initdb_lsn": "0/1696070",
-                "pg_version": 14
-            },
-            "gc_blocking": {
-                "started_at": "2024-07-19T09:00:00.123",
-                "reasons": ["DetachAncestor"]
-            }
-        }"#;
-
-        let expected = IndexPart {
-            version: 9,
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
-                    file_size: 25600000,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
-                    file_size: 9007199254741001,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::new(
-                Lsn::from_str("0/16960E8").unwrap(),
-                Some(Lsn::from_str("0/1696070").unwrap()),
-                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
-                Lsn::INVALID,
-                Lsn::from_str("0/1696070").unwrap(),
-                Lsn::from_str("0/1696070").unwrap(),
-                14,
-            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
            gc_blocking: Some(GcBlocking {
                started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
-                reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
            }),
            last_aux_file_policy: Default::default(),
-            archived_at: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,9 +8,6 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;

-#[cfg(test)]
-pub mod split_writer;
-
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
 use crate::walrecord::NeonWalRecord;
@@ -435,18 +432,39 @@ impl ReadableLayer {
    }
 }

+/// Return value from [`Layer::get_value_reconstruct_data`]
+#[derive(Clone, Copy, Debug)]
+pub enum ValueReconstructResult {
+    /// Got all the data needed to reconstruct the requested page
+    Complete,
+    /// This layer didn't contain all the required data, the caller should look up
+    /// the predecessor layer at the returned LSN and collect more data from there.
+    Continue,
+
+    /// This layer didn't contain data needed to reconstruct the page version at
+    /// the returned LSN. This is usually considered an error, but might be OK
+    /// in some circumstances.
+    Missing,
+}
+
 /// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
 /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
 /// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
 /// be used for cache management but not for correctness-critical checks.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum LayerVisibilityHint {
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
+pub(crate) enum LayerVisibilityHint {
    /// A Visible layer might be read while serving a read, because there is not an image layer between it
    /// and a readable LSN (the tip of the branch or a child's branch point)
    Visible,
    /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
    /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
+    #[allow(unused)]
    Covered,
+    /// Calculating layer visibility requires I/O, so until this has happened layers are loaded
+    /// in this state.  Note that newly written layers may be called Visible immediately; this uninitialized
+    /// state is for when existing layers are constructed while loading a timeline.
+    #[default]
+    Uninitialized,
 }

 pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
@@ -539,25 +557,19 @@ impl LayerAccessStats {
        self.record_residence_event_at(SystemTime::now())
    }

-    fn record_access_at(&self, now: SystemTime) -> bool {
+    pub(crate) fn record_access_at(&self, now: SystemTime) {
        let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);

        // A layer which is accessed must be visible.
        mask |= 0x1 << Self::VISIBILITY_SHIFT;
        value |= 0x1 << Self::VISIBILITY_SHIFT;

-        let old_bits = self.write_bits(mask, value);
-        !matches!(
-            self.decode_visibility(old_bits),
-            LayerVisibilityHint::Visible
-        )
+        self.write_bits(mask, value);
    }

-    /// Returns true if we modified the layer's visibility to set it to Visible implicitly
-    /// as a result of this access
-    pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
+    pub(crate) fn record_access(&self, ctx: &RequestContext) {
        if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
-            return false;
+            return;
        }

        self.record_access_at(SystemTime::now())
@@ -614,29 +626,22 @@ impl LayerAccessStats {
        }
    }

-    /// Helper for extracting the visibility hint from the literal value of our inner u64
-    fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
-        match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
-            1 => LayerVisibilityHint::Visible,
-            0 => LayerVisibilityHint::Covered,
-            _ => unreachable!(),
-        }
-    }
-
-    /// Returns the old value which has been replaced
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
        let value = match visibility {
            LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
-            LayerVisibilityHint::Covered => 0x0,
+            LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
        };

-        let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
-        self.decode_visibility(old_bits)
+        self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
    }

    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
-        self.decode_visibility(read)
+        match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
+            1 => LayerVisibilityHint::Visible,
+            0 => LayerVisibilityHint::Covered,
+            _ => unreachable!(),
+        }
    }
 }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -36,12 +36,13 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
 use crate::tenant::disk_btree::{
    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadPlanner,
 };
-use crate::tenant::PageReconstructError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
@@ -71,7 +72,10 @@ use utils::{
    lsn::Lsn,
 };

-use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
+use super::{
+    AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
+    ValuesReconstructState,
+};

 ///
 /// Header stored in the beginning of the file
@@ -196,6 +200,7 @@ impl DeltaKey {
 pub struct DeltaLayer {
    path: Utf8PathBuf,
    pub desc: PersistentLayerDesc,
+    access_stats: LayerAccessStats,
    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

@@ -294,6 +299,7 @@ impl DeltaLayer {
    /// not loaded already.
    ///
    async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
+        self.access_stats.record_access(ctx);
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
@@ -301,10 +307,12 @@ impl DeltaLayer {
            .with_context(|| format!("Failed to load delta layer {}", self.path()))
    }

-    async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result<Arc<DeltaLayerInner>> {
+    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?;
+        let loaded = DeltaLayerInner::load(&path, None, None, ctx)
+            .await
+            .and_then(|res| res)?;

        // not production code
        let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
@@ -344,6 +352,7 @@ impl DeltaLayer {
                summary.lsn_range,
                metadata.len(),
            ),
+            access_stats: Default::default(),
            inner: OnceCell::new(),
        })
    }
@@ -366,6 +375,7 @@ impl DeltaLayer {
 /// 3. Call `finish`.
 ///
 struct DeltaLayerWriterInner {
+    conf: &'static PageServerConf,
    pub path: Utf8PathBuf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
@@ -376,9 +386,6 @@ struct DeltaLayerWriterInner {
    tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,

    blob_writer: BlobWriter<true>,
-
-    // Number of key-lsns in the layer.
-    num_keys: usize,
 }

 impl DeltaLayerWriterInner {
@@ -412,6 +419,7 @@ impl DeltaLayerWriterInner {
        let tree_builder = DiskBtreeBuilder::new(block_buf);

        Ok(Self {
+            conf,
            path,
            timeline_id,
            tenant_shard_id,
@@ -419,7 +427,6 @@ impl DeltaLayerWriterInner {
            lsn_range,
            tree: tree_builder,
            blob_writer,
-            num_keys: 0,
        })
    }

@@ -462,7 +469,7 @@ impl DeltaLayerWriterInner {
            .write_blob_maybe_compressed(val, ctx, compression)
            .await;
        let off = match res {
-            Ok((off, _)) => off,
+            Ok(off) => off,
            Err(e) => return (val, Err(anyhow::anyhow!(e))),
        };

@@ -470,9 +477,6 @@ impl DeltaLayerWriterInner {

        let delta_key = DeltaKey::from_key_lsn(&key, lsn);
        let res = self.tree.append(&delta_key.0, blob_ref.0);
-
-        self.num_keys += 1;
-
        (val, res.map_err(|e| anyhow::anyhow!(e)))
    }

@@ -486,10 +490,11 @@ impl DeltaLayerWriterInner {
    async fn finish(
        self,
        key_end: Key,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+    ) -> anyhow::Result<ResidentLayer> {
        let temp_path = self.path.clone();
-        let result = self.finish0(key_end, ctx).await;
+        let result = self.finish0(key_end, timeline, ctx).await;
        if result.is_err() {
            tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
            if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -502,8 +507,9 @@ impl DeltaLayerWriterInner {
    async fn finish0(
        self,
        key_end: Key,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -568,9 +574,11 @@ impl DeltaLayerWriterInner {
        // fsync the file
        file.sync_all().await?;

-        trace!("created delta layer {}", self.path);
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        Ok((desc, self.path))
+        trace!("created delta layer {}", layer.local_path());
+
+        Ok(layer)
    }
 }

@@ -671,20 +679,14 @@ impl DeltaLayerWriter {
    pub(crate) async fn finish(
        mut self,
        key_end: Key,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        self.inner.take().unwrap().finish(key_end, ctx).await
-    }
-
-    #[cfg(test)]
-    pub(crate) fn num_keys(&self) -> usize {
-        self.inner.as_ref().unwrap().num_keys
-    }
-
-    #[cfg(test)]
-    pub(crate) fn estimated_size(&self) -> u64 {
-        let inner = self.inner.as_ref().unwrap();
-        inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
+    ) -> anyhow::Result<ResidentLayer> {
+        self.inner
+            .take()
+            .unwrap()
+            .finish(key_end, timeline, ctx)
+            .await
    }
 }

@@ -758,24 +760,27 @@ impl DeltaLayerInner {
        &self.layer_lsn_range
    }

+    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
+    /// - inner has the success or transient failure
+    /// - outer has the permanent failure
    pub(super) async fn load(
        path: &Utf8Path,
        summary: Option<Summary>,
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
-            .await
-            .context("open layer file")?;
-
+    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
+        let file = match VirtualFile::open(path, ctx).await {
+            Ok(file) => file,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
+        };
        let file_id = page_cache::next_file_id();

        let block_reader = FileBlockReader::new(&file, file_id);

-        let summary_blk = block_reader
-            .read_blk(0, ctx)
-            .await
-            .context("read first block")?;
+        let summary_blk = match block_reader.read_blk(0, ctx).await {
+            Ok(blk) => blk,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
+        };

        // TODO: this should be an assertion instead; see ImageLayerInner::load
        let actual_summary =
@@ -797,7 +802,7 @@ impl DeltaLayerInner {
            }
        }

-        Ok(DeltaLayerInner {
+        Ok(Ok(DeltaLayerInner {
            file,
            file_id,
            index_start_blk: actual_summary.index_start_blk,
@@ -805,7 +810,96 @@ impl DeltaLayerInner {
            max_vectored_read_bytes,
            layer_key_range: actual_summary.key_range,
            layer_lsn_range: actual_summary.lsn_range,
-        })
+        }))
+    }
+
+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let mut need_image = true; // cleared once an image or a will_init record is found
+        // Phase 1: walk the index backwards from `lsn_range.end - 1`, collecting blob offsets for `key`.
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            &block_reader,
+        );
+        let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); // `end` is exclusive, hence `- 1`
+
+        let mut offsets: Vec<(Lsn, u64)> = Vec::new();
+
+        tree_reader
+            .visit(
+                &search_key.0,
+                VisitDirection::Backwards,
+                |key, value| {
+                    let blob_ref = BlobRef(value);
+                    if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
+                        return false; // entry is for a different key; `false` presumably ends the visit — confirm
+                    }
+                    let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                    if entry_lsn < lsn_range.start {
+                        return false; // entry is older than the requested range
+                    }
+                    offsets.push((entry_lsn, blob_ref.pos()));
+
+                    !blob_ref.will_init() // keep visiting only while the entry is not flagged will_init
+                },
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+                    .build(),
+            )
+            .await?;
+
+        let ctx = &RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::DeltaLayerValue)
+            .build();
+
+        // Phase 2: 'offsets' now contains the offsets of all the entries we need to read
+        let cursor = block_reader.block_cursor();
+        let mut buf = Vec::new(); // one buffer, handed to read_blob_into_buf on every iteration
+        for (entry_lsn, pos) in offsets {
+            cursor
+                .read_blob_into_buf(pos, &mut buf, ctx)
+                .await
+                .with_context(|| {
+                    format!("Failed to read blob from virtual file {}", self.file.path)
+                })?;
+            let val = Value::des(&buf).with_context(|| {
+                format!(
+                    "Failed to deserialize file blob from virtual file {}",
+                    self.file.path
+                )
+            })?;
+            match val {
+                Value::Image(img) => {
+                    reconstruct_state.img = Some((entry_lsn, img));
+                    need_image = false;
+                    break;
+                }
+                Value::WalRecord(rec) => {
+                    let will_init = rec.will_init();
+                    reconstruct_state.records.push((entry_lsn, rec));
+                    if will_init {
+                        // This WAL record initializes the page, so no need to go further back
+                        need_image = false;
+                        break;
+                    }
+                }
+            }
+        }
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
    }

    // Look up the keys in the provided keyspace and update
@@ -1580,9 +1674,8 @@ pub(crate) mod test {
    use super::*;
    use crate::repository::Value;
    use crate::tenant::harness::TIMELINE_ID;
-    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
-    use crate::tenant::{Tenant, Timeline};
+    use crate::tenant::Tenant;
    use crate::{
        context::DownloadBehavior,
        task_mgr::TaskKind,
@@ -1876,8 +1969,9 @@ pub(crate) mod test {
            res?;
        }

-        let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?;
-        let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?;
+        let resident = writer
+            .finish(entries_meta.key_range.end, &timeline, &ctx)
+            .await?;

        let inner = resident.get_as_delta(&ctx).await?;

@@ -1957,7 +2051,6 @@ pub(crate) mod test {
            .await
            .likely_resident_layers()
            .next()
-            .cloned()
            .unwrap();

        {
@@ -2032,8 +2125,7 @@ pub(crate) mod test {
            .read()
            .await
            .likely_resident_layers()
-            .find(|&x| x != &initdb_layer)
-            .cloned()
+            .find(|x| x != &initdb_layer)
            .unwrap();

        // create a copy for the timeline, so we don't overwrite the file
@@ -2068,8 +2160,7 @@ pub(crate) mod test {
                .await
                .unwrap();

-            let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap();
-            let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();
+            let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();

            copied_layer.get_as_delta(ctx).await.unwrap();

@@ -2197,9 +2288,7 @@ pub(crate) mod test {
        for (key, lsn, value) in deltas {
            writer.put_value(key, lsn, value, ctx).await?;
        }
-
-        let (desc, path) = writer.finish(key_end, ctx).await?;
-        let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
+        let delta_layer = writer.finish(key_end, tline, ctx).await?;

        Ok::<_, anyhow::Error>(delta_layer)
    }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -32,6 +32,9 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{
    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
+use crate::tenant::storage_layer::{
+    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
+};
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -134,6 +137,7 @@ pub struct ImageLayer {
    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
+    access_stats: LayerAccessStats,
    inner: OnceCell<ImageLayerInner>,
 }

@@ -251,6 +255,7 @@ impl ImageLayer {
    /// not loaded already.
    ///
    async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
+        self.access_stats.record_access(ctx);
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
            .await
@@ -260,8 +265,9 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let loaded =
-            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?;
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
+            .await
+            .and_then(|res| res)?;

        // not production code
        let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
@@ -301,6 +307,7 @@ impl ImageLayer {
                metadata.len(),
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
+            access_stats: Default::default(),
            inner: OnceCell::new(),
        })
    }
@@ -378,16 +385,17 @@ impl ImageLayerInner {
        summary: Option<Summary>,
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
-            .await
-            .context("open layer file")?;
+    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
+        let file = match VirtualFile::open(path, ctx).await {
+            Ok(file) => file,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
+        };
        let file_id = page_cache::next_file_id();
        let block_reader = FileBlockReader::new(&file, file_id);
-        let summary_blk = block_reader
-            .read_blk(0, ctx)
-            .await
-            .context("read first block")?;
+        let summary_blk = match block_reader.read_blk(0, ctx).await {
+            Ok(blk) => blk,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
+        };

        // length is the only way how this could fail, so it's not actually likely at all unless
        // read_blk returns wrong sized block.
@@ -412,7 +420,7 @@ impl ImageLayerInner {
            }
        }

-        Ok(ImageLayerInner {
+        Ok(Ok(ImageLayerInner {
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
            lsn,
@@ -420,7 +428,47 @@ impl ImageLayerInner {
            file_id,
            max_vectored_read_bytes,
            key_range: actual_summary.key_range,
-        })
+        }))
+    }
+
+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
+
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf); // serialize `key` into the fixed-size form used for the index lookup
+        if let Some(offset) = tree_reader // look the serialized key up in this layer's index
+            .get(
+                &keybuf,
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::ImageLayerBtreeNode)
+                    .build(),
+            )
+            .await?
+        {
+            let blob = block_reader
+                .block_cursor()
+                .read_blob(
+                    offset,
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::ImageLayerValue)
+                        .build(),
+                )
+                .await
+                .with_context(|| format!("failed to read value from offset {}", offset))?;
+            let value = Bytes::from(blob);
+
+            reconstruct_state.img = Some((self.lsn, value)); // the image is recorded at this layer's LSN
+            Ok(ValueReconstructResult::Complete)
+        } else {
+            Ok(ValueReconstructResult::Missing) // the index has no entry for this key
+        }
    }

    // Look up the keys in the provided keyspace and update
@@ -688,29 +736,11 @@ struct ImageLayerWriterInner {
    // Total uncompressed bytes passed into put_image
    uncompressed_bytes: u64,

-    // Like `uncompressed_bytes`,
-    // but only of images we might consider for compression
-    uncompressed_bytes_eligible: u64,
-
-    // Like `uncompressed_bytes`, but only of images
-    // where we have chosen their compressed form
-    uncompressed_bytes_chosen: u64,
-
-    // Number of keys in the layer.
-    num_keys: usize,
-
    blob_writer: BlobWriter<false>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
-
-    #[cfg_attr(not(feature = "testing"), allow(dead_code))]
-    last_written_key: Key,
 }

 impl ImageLayerWriterInner {
-    fn size(&self) -> u64 {
-        self.tree.borrow_writer().size() + self.blob_writer.size()
-    }
-
    ///
    /// Start building a new image layer.
    ///
@@ -762,10 +792,6 @@ impl ImageLayerWriterInner {
            tree: tree_builder,
            blob_writer,
            uncompressed_bytes: 0,
-            uncompressed_bytes_eligible: 0,
-            uncompressed_bytes_chosen: 0,
-            num_keys: 0,
-            last_written_key: Key::MIN,
        };

        Ok(writer)
@@ -784,33 +810,18 @@ impl ImageLayerWriterInner {
    ) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
        let compression = self.conf.image_compression;
-        let uncompressed_len = img.len() as u64;
-        self.uncompressed_bytes += uncompressed_len;
-        self.num_keys += 1;
+        self.uncompressed_bytes += img.len() as u64;
        let (_img, res) = self
            .blob_writer
            .write_blob_maybe_compressed(img, ctx, compression)
            .await;
        // TODO: re-use the buffer for `img` further upstack
-        let (off, compression_info) = res?;
-        if compression_info.compressed_size.is_some() {
-            // The image has been considered for compression at least
-            self.uncompressed_bytes_eligible += uncompressed_len;
-        }
-        if compression_info.written_compressed {
-            // The image has been compressed
-            self.uncompressed_bytes_chosen += uncompressed_len;
-        }
+        let off = res?;

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
        self.tree.append(&keybuf, off)?;

-        #[cfg(feature = "testing")]
-        {
-            self.last_written_key = key;
-        }
-
        Ok(())
    }

@@ -821,7 +832,6 @@ impl ImageLayerWriterInner {
        self,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-        end_key: Option<Key>,
    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -829,9 +839,6 @@ impl ImageLayerWriterInner {
        // Calculate compression ratio
        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
-            .inc_by(self.uncompressed_bytes_eligible);
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
        crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);

        let mut file = self.blob_writer.into_inner();
@@ -872,23 +879,11 @@ impl ImageLayerWriterInner {
        let desc = PersistentLayerDesc::new_img(
            self.tenant_shard_id,
            self.timeline_id,
-            if let Some(end_key) = end_key {
-                self.key_range.start..end_key
-            } else {
-                self.key_range.clone()
-            },
+            self.key_range.clone(),
            self.lsn,
            metadata.len(),
        );

-        #[cfg(feature = "testing")]
-        if let Some(end_key) = end_key {
-            assert!(
-                self.last_written_key < end_key,
-                "written key violates end_key range"
-            );
-        }
-
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -965,18 +960,6 @@ impl ImageLayerWriter {
        self.inner.as_mut().unwrap().put_image(key, img, ctx).await
    }

-    #[cfg(test)]
-    /// Estimated size of the image layer.
-    pub(crate) fn estimated_size(&self) -> u64 {
-        let inner = self.inner.as_ref().unwrap();
-        inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
-    }
-
-    #[cfg(test)]
-    pub(crate) fn num_keys(&self) -> usize {
-        self.inner.as_ref().unwrap().num_keys
-    }
-
    ///
    /// Finish writing the image layer.
    ///
@@ -985,26 +968,7 @@ impl ImageLayerWriter {
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner.take().unwrap().finish(timeline, ctx, None).await
-    }
-
-    #[cfg(test)]
-    /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
-    pub(super) async fn finish_with_end_key(
-        mut self,
-        timeline: &Arc<Timeline>,
-        end_key: Key,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(timeline, ctx, Some(end_key))
-            .await
-    }
-
-    pub(crate) fn size(&self) -> u64 {
-        self.inner.as_ref().unwrap().size()
+        self.inner.take().unwrap().finish(timeline, ctx).await
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,11 +10,11 @@ use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value};
 use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
+use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
-use crate::tenant::PageReconstructError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::{l0_flush, page_cache, walrecord};
-use anyhow::{anyhow, Result};
-use camino::Utf8PathBuf;
+use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
@@ -34,7 +34,8 @@ use std::sync::atomic::{AtomicU64, AtomicUsize};
 use tokio::sync::{RwLock, RwLockWriteGuard};

 use super::{
-    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
+    DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
+    ValuesReconstructState,
 };

 #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
@@ -54,6 +55,9 @@ pub struct InMemoryLayer {
    /// Writes are only allowed when this is `None`.
    pub(crate) end_lsn: OnceLock<Lsn>,

+    /// Used for traversal path. Cached representation of the in-memory layer before frozen.
+    local_path_str: Arc<str>,
+
    /// Used for traversal path. Cached representation of the in-memory layer after frozen.
    frozen_local_path_str: OnceLock<Arc<str>>,

@@ -244,6 +248,12 @@ impl InMemoryLayer {
        self.start_lsn..self.end_lsn_or_max()
    }

+    pub(crate) fn local_path_str(&self) -> &Arc<str> { // cached path string for this layer
+        self.frozen_local_path_str
+            .get() // set once the layer has been frozen
+            .unwrap_or(&self.local_path_str) // not set yet: use the pre-freeze string
+    }
+
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
@@ -293,6 +303,60 @@ impl InMemoryLayer {
        Ok(())
    }

+    /// Look up `key` within `lsn_range` in this layer and add what is found to `reconstruct_state`.
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        ensure!(lsn_range.start >= self.start_lsn); // reject ranges that begin before this layer's start_lsn
+        let mut need_image = true; // cleared when a will_init record is found
+
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+
+        let inner = self.inner.read().await;
+
+        let reader = inner.file.block_cursor();
+
+        // Walk this key's entries that fall inside `lsn_range`, in reverse order.
+        if let Some(vec_map) = inner.index.get(&key) {
+            let slice = vec_map.slice_range(lsn_range);
+            for (entry_lsn, pos) in slice.iter().rev() {
+                let buf = reader.read_blob(*pos, &ctx).await?;
+                let value = Value::des(&buf)?;
+                match value {
+                    Value::Image(img) => {
+                        reconstruct_state.img = Some((*entry_lsn, img));
+                        return Ok(ValueReconstructResult::Complete); // an image ends the search immediately
+                    }
+                    Value::WalRecord(rec) => {
+                        let will_init = rec.will_init();
+                        reconstruct_state.records.push((*entry_lsn, rec));
+                        if will_init {
+                            // This WAL record initializes the page, so no need to go further back
+                            need_image = false;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        // NOTE(review): `inner` is not dropped explicitly here; the read guard lives until the function returns.
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
+    }
+
    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    //
@@ -385,17 +449,20 @@ impl InMemoryLayer {
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        start_lsn: Lsn,
-        gate_guard: utils::sync::gate::GateGuard,
        ctx: &RequestContext,
    ) -> Result<InMemoryLayer> {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");

-        let file =
-            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
+        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
        let key = InMemoryLayerFileId(file.page_cache_file_id());

        Ok(InMemoryLayer {
            file_id: key,
+            local_path_str: {
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
+                buf.into()
+            },
            frozen_local_path_str: OnceLock::new(),
            conf,
            timeline_id,
@@ -415,7 +482,8 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub async fn put_value(
+
+    pub(crate) async fn put_value(
        &self,
        key: Key,
        lsn: Lsn,
@@ -480,6 +548,8 @@ impl InMemoryLayer {
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    pub async fn freeze(&self, end_lsn: Lsn) {
+        let inner = self.inner.write().await;
+
        assert!(
            self.start_lsn < end_lsn,
            "{} >= {}",
@@ -497,13 +567,9 @@ impl InMemoryLayer {
            })
            .expect("frozen_local_path_str set only once");

-        #[cfg(debug_assertions)]
-        {
-            let inner = self.inner.write().await;
-            for vec_map in inner.index.values() {
-                for (lsn, _pos) in vec_map.as_slice() {
-                    assert!(*lsn < end_lsn);
-                }
+        for vec_map in inner.index.values() {
+            for (lsn, _pos) in vec_map.as_slice() {
+                assert!(*lsn < end_lsn);
            }
        }
    }
@@ -513,12 +579,12 @@ impl InMemoryLayer {
    /// if there are no matching keys.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub async fn write_to_disk(
+    pub(crate) async fn write_to_disk(
        &self,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        key_range: Option<Range<Key>>,
-        l0_flush_global_state: &l0_flush::Inner,
-    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
+    ) -> Result<Option<ResidentLayer>> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -530,8 +596,9 @@ impl InMemoryLayer {
        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().await;

+        let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
        use l0_flush::Inner;
-        let _concurrency_permit = match l0_flush_global_state {
+        let _concurrency_permit = match &*l0_flush_global_state {
            Inner::PageCached => None,
            Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
        };
@@ -561,7 +628,7 @@ impl InMemoryLayer {
        )
        .await?;

-        match l0_flush_global_state {
+        match &*l0_flush_global_state {
            l0_flush::Inner::PageCached => {
                let ctx = RequestContextBuilder::extend(ctx)
                    .page_content_kind(PageContentKind::InMemoryLayer)
@@ -626,7 +693,7 @@ impl InMemoryLayer {
        }

        // MAX is used here because we identify L0 layers by full key range
-        let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;

        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
        //
@@ -638,6 +705,6 @@ impl InMemoryLayer {
        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
        drop(_concurrency_permit);

-        Ok(Some((desc, path)))
+        Ok(Some(delta_layer))
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -24,7 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
 use super::image_layer::{self};
 use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
+    PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -246,7 +246,7 @@ impl Layer {
                &timeline.generation,
            );

-            LayerInner::new(
+            let layer = LayerInner::new(
                conf,
                timeline,
                local_path,
@@ -254,7 +254,14 @@ impl Layer {
                Some(inner),
                timeline.generation,
                timeline.get_shard_index(),
-            )
+            );
+
+            // Newly created layers are marked visible by default: the usual case is that they were created to be read.
+            layer
+                .access_stats
+                .set_visibility(super::LayerVisibilityHint::Visible);
+
+            layer
        }));

        let downloaded = resident.expect("just initialized");
@@ -300,6 +307,42 @@ impl Layer {
        self.0.delete_on_drop();
    }

+    /// Return data needed to reconstruct given page at LSN.
+    ///
+    /// It is up to the caller to collect more data from the previous layer and
+    /// perform WAL redo, if necessary.
+    ///
+    /// # Cancellation-Safety
+    ///
+    /// This method is cancellation-safe.
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        use anyhow::ensure;
+
+        let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?; // `true` presumably permits a download — confirm
+        self.0.access_stats.record_access(ctx); // recorded only after the layer was obtained successfully
+
+        if self.layer_desc().is_delta {
+            ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
+            ensure!(self.layer_desc().key_range.contains(&key));
+        } else {
+            ensure!(self.layer_desc().key_range.contains(&key));
+            ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
+            ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn()); // NOTE(review): implied by the check above when start <= end
+        }
+
+        layer
+            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
+            .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
+            .await
+            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
+    }
+
    pub(crate) async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
@@ -316,7 +359,7 @@ impl Layer {
                other => GetVectoredError::Other(anyhow::anyhow!(other)),
            })?;

-        self.record_access(ctx);
+        self.0.access_stats.record_access(ctx);

        layer
            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -396,18 +439,18 @@ impl Layer {
        self.0.info(reset)
    }

-    pub(crate) fn latest_activity(&self) -> SystemTime {
-        self.0.access_stats.latest_activity()
-    }
-
-    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
-        self.0.access_stats.visibility()
+    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
+        &self.0.access_stats
    }

    pub(crate) fn local_path(&self) -> &Utf8Path {
        &self.0.path
    }

+    pub(crate) fn debug_str(&self) -> &Arc<str> {
+        &self.0.debug_str
+    }
+
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.0.metadata()
    }
@@ -450,57 +493,13 @@ impl Layer {
            }
        }
    }
-
-    fn record_access(&self, ctx: &RequestContext) {
-        if self.0.access_stats.record_access(ctx) {
-            // Visibility was modified to Visible
-            tracing::info!(
-                "Layer {} became visible as a result of access",
-                self.0.desc.key()
-            );
-            if let Some(tl) = self.0.timeline.upgrade() {
-                tl.metrics
-                    .visible_physical_size_gauge
-                    .add(self.0.desc.file_size)
-            }
-        }
-    }
-
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
-        let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
-        use LayerVisibilityHint::*;
-        match (old_visibility, visibility) {
-            (Visible, Covered) => {
-                // Subtract this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    debug_assert!(
-                        tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
-                    );
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .sub(self.0.desc.file_size)
-                }
-            }
-            (Covered, Visible) => {
-                // Add this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .add(self.0.desc.file_size)
-                }
-            }
-            (Covered, Covered) | (Visible, Visible) => {
-                // no change
-            }
-        }
-    }
 }

 /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
 ///
 /// However when we want something evicted, we cannot evict it right away as there might be current
 /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
-/// read with [`Layer::get_values_reconstruct_data`].
+/// read with [`Layer::get_value_reconstruct_data`].
 ///
 /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
 #[derive(Debug)]
@@ -581,6 +580,9 @@ struct LayerInner {
    /// Full path to the file; unclear if this should exist anymore.
    path: Utf8PathBuf,

+    /// String representation of the layer, used for traversal id.
+    debug_str: Arc<str>,
+
    desc: PersistentLayerDesc,

    /// Timeline access is needed for remote timeline client and metrics.
@@ -691,16 +693,6 @@ impl Drop for LayerInner {
                timeline.metrics.layer_count_image.dec();
                timeline.metrics.layer_size_image.sub(self.desc.file_size);
            }
-
-            if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
-                debug_assert!(
-                    timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
-                );
-                timeline
-                    .metrics
-                    .visible_physical_size_gauge
-                    .sub(self.desc.file_size);
-            }
        }

        if !*self.wanted_deleted.get_mut() {
@@ -809,14 +801,11 @@ impl LayerInner {
            timeline.metrics.layer_size_image.add(desc.file_size);
        }

-        // New layers are visible by default. This metric is later updated on drop or in set_visibility
-        timeline
-            .metrics
-            .visible_physical_size_gauge
-            .add(desc.file_size);
-
        LayerInner {
            conf,
+            debug_str: {
+                format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
+            },
            path: local_path,
            desc,
            timeline: Arc::downgrade(timeline),
@@ -1662,9 +1651,8 @@ impl Drop for DownloadedLayer {
 }

 impl DownloadedLayer {
-    /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`].
-    /// Failure to load the layer is sticky, i.e., future `get()` calls will return
-    /// the initial load failure immediately.
+    /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`]. A permanent
+    /// load failure is stored and reported by every later call; transient errors are not stored.
    ///
    /// `owner` parameter is a strong reference at the same `LayerInner` as the
    /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
@@ -1695,7 +1683,7 @@ impl DownloadedLayer {
                    ctx,
                )
                .await
-                .map(LayerKind::Delta)
+                .map(|res| res.map(LayerKind::Delta))
            } else {
                let lsn = owner.desc.image_layer_lsn();
                let summary = Some(image_layer::Summary::expected(
@@ -1712,29 +1700,54 @@ impl DownloadedLayer {
                    ctx,
                )
                .await
-                .map(LayerKind::Image)
+                .map(|res| res.map(LayerKind::Image))
            };

            match res {
-                Ok(layer) => Ok(layer),
-                Err(err) => {
+                Ok(Ok(layer)) => Ok(Ok(layer)),
+                Ok(Err(transient)) => Err(transient),
+                Err(permanent) => {
                    LAYER_IMPL_METRICS.inc_permanent_loading_failures();
-                    // We log this message once over the lifetime of `Self`
-                    // => Ok and good to log backtrace and path here.
-                    tracing::error!(
-                        "layer load failed, assuming permanent failure: {}: {err:?}",
-                        owner.path
-                    );
-                    Err(err)
+                    // TODO(#5815): we are not logging all errors, so temporarily log them **once**
+                    // here as well
+                    let permanent = permanent.context("load layer");
+                    tracing::error!("layer loading failed permanently: {permanent:#}");
+                    Ok(Err(permanent))
                }
            }
        };
        self.kind
-            .get_or_init(init)
-            .await
+            .get_or_try_init(init)
+            // return transient errors using `?`
+            .await?
            .as_ref()
-            // We already logged the full backtrace above, once. Don't repeat that here.
-            .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
+            .map_err(|e| {
+                // errors are not cloneable, so all we can do is stringify them
+                // test_broken_timeline matches this string
+                anyhow::anyhow!("layer loading failed: {e:#}")
+            })
+    }
+
+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        owner: &Arc<LayerInner>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        use LayerKind::*;
+
+        match self.get(owner, ctx).await? {
+            Delta(d) => {
+                d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
+                    .await
+            }
+            Image(i) => {
+                i.get_value_reconstruct_data(key, reconstruct_data, ctx)
+                    .await
+            }
+        }
    }

    async fn get_values_reconstruct_data(
@@ -1747,11 +1760,7 @@ impl DownloadedLayer {
    ) -> Result<(), GetVectoredError> {
        use LayerKind::*;

-        match self
-            .get(owner, ctx)
-            .await
-            .map_err(GetVectoredError::Other)?
-        {
+        match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
            Delta(d) => {
                d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
                    .await
@@ -1835,7 +1844,7 @@ impl ResidentLayer {
                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
                // while it's being held.
-                self.owner.record_access(ctx);
+                owner.access_stats.record_access(ctx);

                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -39,7 +39,7 @@ async fn smoke_test() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -50,26 +50,13 @@ async fn smoke_test() {
    // all layers created at pageserver are like `layer`, initialized with strong
    // Arc<DownloadedLayer>.

-    let controlfile_keyspace = KeySpace {
-        ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
-    };
-
    let img_before = {
-        let mut data = ValuesReconstructState::default();
+        let mut data = ValueReconstructState::default();
        layer
-            .get_values_reconstruct_data(
-                controlfile_keyspace.clone(),
-                Lsn(0x10)..Lsn(0x11),
-                &mut data,
-                &ctx,
-            )
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
            .await
            .unwrap();
-        data.keys
-            .remove(&CONTROLFILE_KEY)
-            .expect("must be present")
-            .expect("should not error")
-            .img
+        data.img
            .take()
            .expect("tenant harness writes the control file")
    };
@@ -87,24 +74,13 @@ async fn smoke_test() {

    // on accesses when the layer is evicted, it will automatically be downloaded.
    let img_after = {
-        let mut data = ValuesReconstructState::default();
+        let mut data = ValueReconstructState::default();
        layer
-            .get_values_reconstruct_data(
-                controlfile_keyspace.clone(),
-                Lsn(0x10)..Lsn(0x11),
-                &mut data,
-                &ctx,
-            )
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
            .instrument(download_span.clone())
            .await
            .unwrap();
-        data.keys
-            .remove(&CONTROLFILE_KEY)
-            .expect("must be present")
-            .expect("should not error")
-            .img
-            .take()
-            .expect("tenant harness writes the control file")
+        data.img.take().unwrap()
    };

    assert_eq!(img_before, img_after);
@@ -176,7 +152,7 @@ async fn smoke_test() {
    {
        let layers = &[layer];
        let mut g = timeline.layers.write().await;
-        g.open_mut().unwrap().finish_gc_timeline(layers);
+        g.finish_gc_timeline(layers);
        // this just updates the remote_physical_size for demonstration purposes
        rtc.schedule_gc_update(layers).unwrap();
    }
@@ -216,7 +192,7 @@ async fn evict_and_wait_on_wanted_deleted() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -260,7 +236,7 @@ async fn evict_and_wait_on_wanted_deleted() {
    // the deletion of the layer in remote_storage happens.
    {
        let mut layers = timeline.layers.write().await;
-        layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
+        layers.finish_gc_timeline(&[layer]);
    }

    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
@@ -301,7 +277,7 @@ fn read_wins_pending_eviction() {
        let layer = {
            let mut layers = {
                let layers = timeline.layers.read().await;
-                layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+                layers.likely_resident_layers().collect::<Vec<_>>()
            };

            assert_eq!(layers.len(), 1);
@@ -433,7 +409,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
        let layer = {
            let mut layers = {
                let layers = timeline.layers.read().await;
-                layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+                layers.likely_resident_layers().collect::<Vec<_>>()
            };

            assert_eq!(layers.len(), 1);
@@ -602,7 +578,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -682,7 +658,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -801,9 +777,9 @@ async fn eviction_cancellation_on_drop() {
    let (evicted_layer, not_evicted) = {
        let mut layers = {
            let mut guard = timeline.layers.write().await;
-            let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
+            let layers = guard.likely_resident_layers().collect::<Vec<_>>();
            // remove the layers from layermap
-            guard.open_mut().unwrap().finish_gc_timeline(&layers);
+            guard.finish_gc_timeline(&layers);

            layers
        };
@@ -852,9 +828,9 @@ async fn eviction_cancellation_on_drop() {
 #[test]
 #[cfg(target_arch = "x86_64")]
 fn layer_size() {
-    assert_eq!(size_of::<LayerAccessStats>(), 8);
-    assert_eq!(size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(size_of::<LayerInner>(), 296);
+    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 8);
+    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
+    assert_eq!(std::mem::size_of::<LayerInner>(), 312);
    // it also has the utf8 path
 }

--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -41,20 +41,6 @@ pub struct PersistentLayerKey {
    pub is_delta: bool,
 }

-impl std::fmt::Display for PersistentLayerKey {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{}..{} {}..{} is_delta={}",
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta
-        )
-    }
-}
-
 impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -204,11 +204,9 @@ impl<'a> IteratorWrapper<'a> {
 /// A merge iterator over delta/image layer iterators. When duplicated records are
 /// found, the iterator will not perform any deduplication, and the caller should handle
 /// these situation. By saying duplicated records, there are many possibilities:
-///
 /// * Two same delta at the same LSN.
 /// * Two same image at the same LSN.
 /// * Delta/image at the same LSN where the image has already applied the delta.
-///
 /// The iterator will always put the image before the delta.
 pub struct MergeIterator<'a> {
    heap: BinaryHeap<IteratorWrapper<'a>>,
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -1,454 +0,0 @@
-use std::{ops::Range, sync::Arc};
-
-use bytes::Bytes;
-use pageserver_api::key::{Key, KEY_SIZE};
-use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
-
-use crate::tenant::storage_layer::Layer;
-use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
-
-use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
-
-/// An image writer that takes images and produces multiple image layers. The interface does not
-/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
-/// to be cleaned up)
-#[must_use]
-pub struct SplitImageLayerWriter {
-    inner: ImageLayerWriter,
-    target_layer_size: u64,
-    generated_layers: Vec<ResidentLayer>,
-    conf: &'static PageServerConf,
-    timeline_id: TimelineId,
-    tenant_shard_id: TenantShardId,
-    lsn: Lsn,
-}
-
-impl SplitImageLayerWriter {
-    pub async fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_shard_id: TenantShardId,
-        start_key: Key,
-        lsn: Lsn,
-        target_layer_size: u64,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
-            target_layer_size,
-            inner: ImageLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                &(start_key..Key::MAX),
-                lsn,
-                ctx,
-            )
-            .await?,
-            generated_layers: Vec::new(),
-            conf,
-            timeline_id,
-            tenant_shard_id,
-            lsn,
-        })
-    }
-
-    pub async fn put_image(
-        &mut self,
-        key: Key,
-        img: Bytes,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // The current estimation is an upper bound of the space that the key/image could take
-        // because we did not consider compression in this estimation. The resulting image layer
-        // could be smaller than the target size.
-        let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
-        {
-            let next_image_writer = ImageLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_shard_id,
-                &(key..Key::MAX),
-                self.lsn,
-                ctx,
-            )
-            .await?;
-            let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
-            self.generated_layers.push(
-                prev_image_writer
-                    .finish_with_end_key(tline, key, ctx)
-                    .await?,
-            );
-        }
-        self.inner.put_image(key, img, ctx).await
-    }
-
-    pub(crate) async fn finish(
-        self,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-        end_key: Key,
-    ) -> anyhow::Result<Vec<ResidentLayer>> {
-        let Self {
-            mut generated_layers,
-            inner,
-            ..
-        } = self;
-        generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
-        Ok(generated_layers)
-    }
-
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
-    #[allow(dead_code)]
-    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
-        Ok((self.generated_layers, self.inner))
-    }
-}
-
-/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
-/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
-/// to be cleaned up).
-#[must_use]
-pub struct SplitDeltaLayerWriter {
-    inner: DeltaLayerWriter,
-    target_layer_size: u64,
-    generated_layers: Vec<ResidentLayer>,
-    conf: &'static PageServerConf,
-    timeline_id: TimelineId,
-    tenant_shard_id: TenantShardId,
-    lsn_range: Range<Lsn>,
-}
-
-impl SplitDeltaLayerWriter {
-    pub async fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_shard_id: TenantShardId,
-        start_key: Key,
-        lsn_range: Range<Lsn>,
-        target_layer_size: u64,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
-            target_layer_size,
-            inner: DeltaLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                start_key,
-                lsn_range.clone(),
-                ctx,
-            )
-            .await?,
-            generated_layers: Vec::new(),
-            conf,
-            timeline_id,
-            tenant_shard_id,
-            lsn_range,
-        })
-    }
-
-    pub async fn put_value(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        val: Value,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
-        // number, and therefore the final layer size could be a little bit larger or smaller than the target.
-        let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
-        {
-            let next_delta_writer = DeltaLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_shard_id,
-                key,
-                self.lsn_range.clone(),
-                ctx,
-            )
-            .await?;
-            let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
-            let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
-            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-            self.generated_layers.push(delta_layer);
-        }
-        self.inner.put_value(key, lsn, val, ctx).await
-    }
-
-    pub(crate) async fn finish(
-        self,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-        end_key: Key,
-    ) -> anyhow::Result<Vec<ResidentLayer>> {
-        let Self {
-            mut generated_layers,
-            inner,
-            ..
-        } = self;
-
-        let (desc, path) = inner.finish(end_key, ctx).await?;
-        let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-        generated_layers.push(delta_layer);
-        Ok(generated_layers)
-    }
-
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
-    #[allow(dead_code)]
-    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
-        Ok((self.generated_layers, self.inner))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::{
-        tenant::{
-            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::AsLayerDesc,
-        },
-        DEFAULT_PG_VERSION,
-    };
-
-    use super::*;
-
-    fn get_key(id: u32) -> Key {
-        let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-        key.field6 = id;
-        key
-    }
-
-    fn get_img(id: u32) -> Bytes {
-        format!("{id:064}").into()
-    }
-
-    fn get_large_img() -> Bytes {
-        vec![0; 8192].into()
-    }
-
-    #[tokio::test]
-    async fn write_one_image() {
-        let harness = TenantHarness::create("split_writer_write_one_image")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut image_writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        let mut delta_writer = SplitDeltaLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18)..Lsn(0x20),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        image_writer
-            .put_image(get_key(0), get_img(0), &tline, &ctx)
-            .await
-            .unwrap();
-        let layers = image_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 1);
-
-        delta_writer
-            .put_value(
-                get_key(0),
-                Lsn(0x18),
-                Value::Image(get_img(0)),
-                &tline,
-                &ctx,
-            )
-            .await
-            .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 1);
-    }
-
-    #[tokio::test]
-    async fn write_split() {
-        let harness = TenantHarness::create("split_writer_write_split")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut image_writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-        let mut delta_writer = SplitDeltaLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18)..Lsn(0x20),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-        const N: usize = 2000;
-        for i in 0..N {
-            let i = i as u32;
-            image_writer
-                .put_image(get_key(i), get_large_img(), &tline, &ctx)
-                .await
-                .unwrap();
-            delta_writer
-                .put_value(
-                    get_key(i),
-                    Lsn(0x20),
-                    Value::Image(get_large_img()),
-                    &tline,
-                    &ctx,
-                )
-                .await
-                .unwrap();
-        }
-        let image_layers = image_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
-        let delta_layers = delta_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
-        assert_eq!(image_layers.len(), N / 512 + 1);
-        assert_eq!(delta_layers.len(), N / 512 + 1);
-        for idx in 0..image_layers.len() {
-            assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
-            assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
-            assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
-            assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
-            if idx > 0 {
-                assert_eq!(
-                    image_layers[idx - 1].layer_desc().key_range.end,
-                    image_layers[idx].layer_desc().key_range.start
-                );
-                assert_eq!(
-                    delta_layers[idx - 1].layer_desc().key_range.end,
-                    delta_layers[idx].layer_desc().key_range.start
-                );
-            }
-        }
-    }
-
-    #[tokio::test]
-    async fn write_large_img() {
-        let harness = TenantHarness::create("split_writer_write_large_img")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut image_writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        let mut delta_writer = SplitDeltaLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18)..Lsn(0x20),
-            4 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        image_writer
-            .put_image(get_key(0), get_img(0), &tline, &ctx)
-            .await
-            .unwrap();
-        image_writer
-            .put_image(get_key(1), get_large_img(), &tline, &ctx)
-            .await
-            .unwrap();
-        let layers = image_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 2);
-
-        delta_writer
-            .put_value(
-                get_key(0),
-                Lsn(0x18),
-                Value::Image(get_img(0)),
-                &tline,
-                &ctx,
-            )
-            .await
-            .unwrap();
-        delta_writer
-            .put_value(
-                get_key(1),
-                Lsn(0x1A),
-                Value::Image(get_large_img()),
-                &tline,
-                &ctx,
-            )
-            .await
-            .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 2);
-    }
-}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -129,9 +129,11 @@ pub fn start_background_loops(
            let background_jobs_can_start = background_jobs_can_start.cloned();
            async move {
                let cancel = task_mgr::shutdown_token();
+                let can_start = completion::Barrier::maybe_wait(background_jobs_can_start);
+                let can_start = tenant.ongoing_timeline_detach.gc_sleeping_while(can_start);
                tokio::select! {
                    _ = cancel.cancelled() => { return Ok(()) },
-                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                    _ = can_start => {}
                };
                gc_loop(tenant, cancel)
                    .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
@@ -210,28 +212,24 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                Duration::from_secs(10)
            } else {
                // Run compaction
-                match tenant.compaction_iteration(&cancel, &ctx).await {
-                    Err(e) => {
-                        let wait_duration = backoff::exponential_backoff_duration_seconds(
-                            error_run_count + 1,
-                            1.0,
-                            MAX_BACKOFF_SECS,
-                        );
-                        error_run_count += 1;
-                        let wait_duration = Duration::from_secs_f64(wait_duration);
-                        log_compaction_error(
-                            &e,
-                            error_run_count,
-                            &wait_duration,
-                            cancel.is_cancelled(),
-                        );
-                        wait_duration
-                    }
-                    Ok(has_pending_task) => {
-                        error_run_count = 0;
-                        // schedule the next compaction immediately in case there is a pending compaction task
-                        if has_pending_task { Duration::from_secs(0) } else { period }
-                    }
+                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count + 1,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    let wait_duration = Duration::from_secs_f64(wait_duration);
+                    log_compaction_error(
+                        &e,
+                        error_run_count,
+                        &wait_duration,
+                        cancel.is_cancelled(),
+                    );
+                    wait_duration
+                } else {
+                    error_run_count = 0;
+                    period
                }
            };

@@ -365,14 +363,13 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            if first {
                first = false;

-                if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel)
-                    .await
-                    .is_err()
-                {
-                    break;
-                }
+                let delays = async {
+                    delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
+                    random_init_delay(period, &cancel).await?;
+                    Ok::<_, Cancelled>(())
+                };

-                if random_init_delay(period, &cancel).await.is_err() {
+                if tenant.ongoing_timeline_detach.gc_sleeping_while(delays).await.is_err() {
                    break;
                }
            }
@@ -407,16 +404,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                        error_run_count += 1;
                        let wait_duration = Duration::from_secs_f64(wait_duration);

-                        if matches!(e, crate::tenant::GcError::TimelineCancelled) {
-                            // Timeline was cancelled during gc. We might either be in an event
-                            // that affects the entire tenant (tenant deletion, pageserver shutdown),
-                            // or in one that affects the timeline only (timeline deletion).
-                            // Therefore, don't exit the loop.
-                            info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
-                        } else {
-                            error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
-                        }
-
+                        error!(
+                            "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
+                        );
                        wait_duration
                    }
                }
@@ -425,7 +415,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);

            // Sleep
-            if tokio::time::timeout(sleep_duration, cancel.cancelled())
+            let cancelled = cancel.cancelled();
+            let cancelled = tenant.ongoing_timeline_detach.gc_sleeping_while(cancelled);
+            if tokio::time::timeout(sleep_duration, cancelled)
                .await
                .is_ok()
            {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -63,19 +63,10 @@ pub(super) async fn delete_local_timeline_directory(
    tenant_shard_id: TenantShardId,
    timeline: &Timeline,
 ) -> anyhow::Result<()> {
-    // Always ensure the lock order is compaction -> gc.
-    let compaction_lock = timeline.compaction_lock.lock();
-    let compaction_lock = crate::timed(
-        compaction_lock,
-        "acquires compaction lock",
-        std::time::Duration::from_secs(5),
-    )
-    .await;
-
-    let gc_lock = timeline.gc_lock.lock();
-    let gc_lock = crate::timed(
-        gc_lock,
-        "acquires gc lock",
+    let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
+    let guards = crate::timed(
+        guards,
+        "acquire gc and compaction locks",
        std::time::Duration::from_secs(5),
    )
    .await;
@@ -116,8 +107,7 @@ pub(super) async fn delete_local_timeline_directory(
        .context("fsync_pre_mark_remove")?;

    info!("finished deleting layer files, releasing locks");
-    drop(gc_lock);
-    drop(compaction_lock);
+    drop(guards);

    fail::fail_point!("timeline-delete-after-rm", |_| {
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
@@ -216,10 +206,11 @@ impl DeleteTimelineFlow {
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip_all)]
+    #[instrument(skip_all, fields(%inplace))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
+        inplace: bool,
    ) -> Result<(), DeleteTimelineError> {
        super::debug_assert_current_span_has_tenant_and_timeline_id();

@@ -230,7 +221,7 @@ impl DeleteTimelineFlow {
        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
        timeline.shutdown(super::ShutdownMode::Hard).await;

-        tenant.gc_block.before_delete(&timeline);
+        tenant.ongoing_timeline_detach.on_delete(&timeline);

        fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
            Err(anyhow::anyhow!(
@@ -246,7 +237,11 @@ impl DeleteTimelineFlow {
            ))?
        });

-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        if inplace {
+            Self::background(guard, tenant.conf, tenant, &timeline).await?
+        } else {
+            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        }

        Ok(())
    }
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -213,45 +213,51 @@ impl Timeline {
        let mut js = tokio::task::JoinSet::new();
        {
            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            for layer in layers.iter_historic_layers() {
+                let layer = guard.get_from_desc(&layer);

-            guard
-                .likely_resident_layers()
-                .filter(|layer| {
-                    let last_activity_ts = layer.latest_activity();
+                // guard against eviction while we inspect it; it might be that eviction_task and
+                // disk_usage_eviction_task both select the same layers to be evicted, and
+                // seemingly free up double the space. both succeeding is of no consequence.

-                    let no_activity_for = match now.duration_since(last_activity_ts) {
-                        Ok(d) => d,
-                        Err(_e) => {
-                            // We reach here if `now` < `last_activity_ts`, which can legitimately
-                            // happen if there is an access between us getting `now`, and us getting
-                            // the access stats from the layer.
-                            //
-                            // The other reason why it can happen is system clock skew because
-                            // SystemTime::now() is not monotonic, so, even if there is no access
-                            // to the layer after we get `now` at the beginning of this function,
-                            // it could be that `now`  < `last_activity_ts`.
-                            //
-                            // To distinguish the cases, we would need to record `Instant`s in the
-                            // access stats (i.e., monotonic timestamps), but then, the timestamps
-                            // values in the access stats would need to be `Instant`'s, and hence
-                            // they would be meaningless outside of the pageserver process.
-                            // At the time of writing, the trade-off is that access stats are more
-                            // valuable than detecting clock skew.
-                            return false;
-                        }
-                    };
+                if !layer.is_likely_resident() {
+                    continue;
+                }

-                    no_activity_for > p.threshold
-                })
-                .cloned()
-                .for_each(|layer| {
+                let last_activity_ts = layer.access_stats().latest_activity();
+
+                let no_activity_for = match now.duration_since(last_activity_ts) {
+                    Ok(d) => d,
+                    Err(_e) => {
+                        // We reach here if `now` < `last_activity_ts`, which can legitimately
+                        // happen if there is an access between us getting `now`, and us getting
+                        // the access stats from the layer.
+                        //
+                        // The other reason why it can happen is system clock skew because
+                        // SystemTime::now() is not monotonic, so, even if there is no access
+                        // to the layer after we get `now` at the beginning of this function,
+                        // it could be that `now`  < `last_activity_ts`.
+                        //
+                        // To distinguish the cases, we would need to record `Instant`s in the
+                        // access stats (i.e., monotonic timestamps), but then, the timestamps
+                        // values in the access stats would need to be `Instant`'s, and hence
+                        // they would be meaningless outside of the pageserver process.
+                        // At the time of writing, the trade-off is that access stats are more
+                        // valuable than detecting clock skew.
+                        continue;
+                    }
+                };
+
+                if no_activity_for > p.threshold {
                    js.spawn(async move {
                        layer
                            .evict_and_wait(std::time::Duration::from_secs(5))
                            .await
                    });
                    stats.candidates += 1;
-                });
+                }
+            }
        };

        let join_all = async move {
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1,967 +0,0 @@
-//! An efficient way to keep the timeline gate open without preventing
-//! timeline shutdown for longer than a single call to a timeline method.
-//!
-//! # Motivation
-//!
-//! On a single page service connection, we're typically serving a single TenantTimelineId.
-//!
-//! Without sharding, there is a single Timeline object to which we dispatch
-//! all requests. For example, a getpage request gets dispatched to the
-//! Timeline::get method of the Timeline object that represents the
-//! (tenant,timeline) of that connection.
-//!
-//! With sharding, for each request that comes in on the connection,
-//! we first have to perform shard routing based on the requested key (=~ page number).
-//! The result of shard routing is a Timeline object.
-//! We then dispatch the request to that Timeline object.
-//!
-//! Regardless of whether the tenant is sharded or not, we want to ensure that
-//! we hold the Timeline gate open while we're invoking the method on the
-//! Timeline object.
-//!
-//! However, we want to avoid the overhead of entering the gate for every
-//! method invocation.
-//!
-//! Further, for shard routing, we want to avoid calling the tenant manager to
-//! resolve the shard for every request. Instead, we want to cache the
-//! routing result so we can bypass the tenant manager for all subsequent requests
-//! that get routed to that shard.
-//!
-//! Regardless of how we accomplish the above, it should not
-//! prevent the Timeline from shutting down promptly.
-//!
-//! # Design
-//!
-//! There are three user-facing data structures:
-//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
-//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
-//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
-//!   Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
-//!
-//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
-//!
-//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
-//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
-//!
-//! To dispatch a request, the page service connection calls `Cache::get`.
-//!
-//! A cache miss means we consult the tenant manager for shard routing,
-//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
-//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
-//! and the `Arc<HandleInner>` in the `PerTimelineState`.
-//!
-//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
-//! and find the `Weak<HandleInner>` in the cache.
-//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
-//!
-//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
-//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
-//!
-//! # Memory Management / How The Reference Cycle Is Broken
-//!
-//! The attentive reader may have noticed the strong reference cycle
-//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
-//!
-//! This cycle is intentional: while it exists, the `Cache` can upgrade its
-//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
-//!
-//! The cycle is broken by either
-//! - `PerTimelineState::shutdown` or
-//! - dropping the `Cache`.
-//!
-//! Concurrently existing `Handle`s will extend the existence of the cycle.
-//! However, since `Handle`s are short-lived and new `Handle`s are not
-//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
-//! that extension of the cycle is bounded.
-//!
-//! # Fast Path for Shard Routing
-//!
-//! The `Cache` has a fast path for shard routing to avoid calling into
-//! the tenant manager for every request.
-//!
-//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
-//!
-//! The current implementation uses the first entry in the hash map
-//! to determine the `ShardParameters` and derive the correct
-//! `ShardIndex` for the requested key.
-//!
-//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
-//!
-//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
-//! it's a hit.
-//!
-//! ## Cache invalidation
-//!
-//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
-//! The only reasons why an entry in the cache can become stale are:
-//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
-//!    being detached, timeline or shard deleted, or pageserver is shutting down.
-//! 2. We're doing a shard split and new traffic should be routed to the child shards.
-//!
-//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
-//! timeline has shut down, and when that happens, we remove the entry from the cache.
-//!
-//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
-//! to the parent shard during a shard split. Eventually, the shard split task will
-//! shut down the parent => case (1).
-
-use std::collections::hash_map;
-use std::collections::HashMap;
-use std::sync::atomic::AtomicBool;
-use std::sync::atomic::Ordering;
-use std::sync::Arc;
-use std::sync::Mutex;
-use std::sync::Weak;
-
-use pageserver_api::shard::ShardIdentity;
-use tracing::instrument;
-use tracing::trace;
-use utils::id::TimelineId;
-use utils::shard::ShardIndex;
-use utils::shard::ShardNumber;
-
-use crate::tenant::mgr::ShardSelector;
-
-/// The requirement for Debug is so that #[derive(Debug)] works in some places.
-pub(crate) trait Types: Sized + std::fmt::Debug {
-    type TenantManagerError: Sized + std::fmt::Debug;
-    type TenantManager: TenantManager<Self> + Sized;
-    type Timeline: ArcTimeline<Self> + Sized;
-}
-
-/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
-/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
-/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
-#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
-struct CacheId(u64);
-
-impl CacheId {
-    fn next() -> Self {
-        static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
-        let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        if id == 0 {
-            panic!("CacheId::new() returned 0, overflow");
-        }
-        Self(id)
-    }
-}
-
-/// See module-level comment.
-pub(crate) struct Cache<T: Types> {
-    id: CacheId,
-    map: Map<T>,
-}
-
-type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
-
-impl<T: Types> Default for Cache<T> {
-    fn default() -> Self {
-        Self {
-            id: CacheId::next(),
-            map: Default::default(),
-        }
-    }
-}
-
-#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
-pub(crate) struct ShardTimelineId {
-    pub(crate) shard_index: ShardIndex,
-    pub(crate) timeline_id: TimelineId,
-}
-
-/// See module-level comment.
-pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
-struct HandleInner<T: Types> {
-    shut_down: AtomicBool,
-    timeline: T::Timeline,
-    // The timeline's gate held open.
-    _gate_guard: utils::sync::gate::GateGuard,
-}
-
-/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
-///
-/// See module-level comment for details.
-pub struct PerTimelineState<T: Types> {
-    // None = shutting down
-    handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
-}
-
-impl<T: Types> Default for PerTimelineState<T> {
-    fn default() -> Self {
-        Self {
-            handles: Mutex::new(Some(Default::default())),
-        }
-    }
-}
-
-/// Abstract view of [`crate::tenant::mgr`], for testability.
-pub(crate) trait TenantManager<T: Types> {
-    /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
-    /// Errors are returned as [`GetError::TenantManager`].
-    async fn resolve(
-        &self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-    ) -> Result<T::Timeline, T::TenantManagerError>;
-}
-
-/// Abstract view of an [`Arc<Timeline>`], for testability.
-pub(crate) trait ArcTimeline<T: Types>: Clone {
-    fn gate(&self) -> &utils::sync::gate::Gate;
-    fn shard_timeline_id(&self) -> ShardTimelineId;
-    fn get_shard_identity(&self) -> &ShardIdentity;
-    fn per_timeline_state(&self) -> &PerTimelineState<T>;
-}
-
-/// Errors returned by [`Cache::get`].
-#[derive(Debug)]
-pub(crate) enum GetError<T: Types> {
-    TenantManager(T::TenantManagerError),
-    TimelineGateClosed,
-    PerTimelineStateShutDown,
-}
-
-/// Internal type used in [`Cache::get`].
-enum RoutingResult<T: Types> {
-    FastPath(Handle<T>),
-    SlowPath(ShardTimelineId),
-    NeedConsultTenantManager,
-}
-
-impl<T: Types> Cache<T> {
-    /// See module-level comment for details.
-    ///
-    /// Does NOT check for the shutdown state of [`Types::Timeline`].
-    /// Instead, the methods of [`Types::Timeline`] that are invoked through
-    /// the [`Handle`] are responsible for checking these conditions
-    /// and if so, return an error that causes the page service to
-    /// close the connection.
-    #[instrument(level = "trace", skip_all)]
-    pub(crate) async fn get(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        // terminates because each iteration removes an element from the map
-        loop {
-            let handle = self
-                .get_impl(timeline_id, shard_selector, tenant_manager)
-                .await?;
-            if handle.0.shut_down.load(Ordering::Relaxed) {
-                let removed = self
-                    .map
-                    .remove(&handle.0.timeline.shard_timeline_id())
-                    .expect("invariant of get_impl is that the returned handle is in the map");
-                assert!(
-                    Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
-                    "shard_timeline_id() incorrect?"
-                );
-            } else {
-                return Ok(handle);
-            }
-        }
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    async fn get_impl(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        let miss: ShardSelector = {
-            let routing_state = self.shard_routing(timeline_id, shard_selector);
-            match routing_state {
-                RoutingResult::FastPath(handle) => return Ok(handle),
-                RoutingResult::SlowPath(key) => match self.map.get(&key) {
-                    Some(cached) => match cached.upgrade() {
-                        Some(upgraded) => return Ok(Handle(upgraded)),
-                        None => {
-                            trace!("handle cache stale");
-                            self.map.remove(&key).unwrap();
-                            ShardSelector::Known(key.shard_index)
-                        }
-                    },
-                    None => ShardSelector::Known(key.shard_index),
-                },
-                RoutingResult::NeedConsultTenantManager => shard_selector,
-            }
-        };
-        self.get_miss(timeline_id, miss, tenant_manager).await
-    }
-
-    #[inline(always)]
-    fn shard_routing(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-    ) -> RoutingResult<T> {
-        loop {
-            // terminates because when every iteration we remove an element from the map
-            let Some((first_key, first_handle)) = self.map.iter().next() else {
-                return RoutingResult::NeedConsultTenantManager;
-            };
-            let Some(first_handle) = first_handle.upgrade() else {
-                // TODO: dedup with get()
-                trace!("handle cache stale");
-                let first_key_owned = *first_key;
-                self.map.remove(&first_key_owned).unwrap();
-                continue;
-            };
-
-            let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
-            let make_shard_index = |shard_num: ShardNumber| ShardIndex {
-                shard_number: shard_num,
-                shard_count: first_handle_shard_identity.count,
-            };
-
-            let need_idx = match shard_selector {
-                ShardSelector::Page(key) => {
-                    make_shard_index(first_handle_shard_identity.get_shard_number(&key))
-                }
-                ShardSelector::Zero => make_shard_index(ShardNumber(0)),
-                ShardSelector::Known(shard_idx) => shard_idx,
-            };
-            let need_shard_timeline_id = ShardTimelineId {
-                shard_index: need_idx,
-                timeline_id,
-            };
-            let first_handle_shard_timeline_id = ShardTimelineId {
-                shard_index: first_handle_shard_identity.shard_index(),
-                timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
-            };
-
-            if need_shard_timeline_id == first_handle_shard_timeline_id {
-                return RoutingResult::FastPath(Handle(first_handle));
-            } else {
-                return RoutingResult::SlowPath(need_shard_timeline_id);
-            }
-        }
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    #[inline(always)]
-    async fn get_miss(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        match tenant_manager.resolve(timeline_id, shard_selector).await {
-            Ok(timeline) => {
-                let key = timeline.shard_timeline_id();
-                match &shard_selector {
-                    ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
-                    ShardSelector::Page(_) => (), // gotta trust tenant_manager
-                    ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
-                }
-
-                let gate_guard = match timeline.gate().enter() {
-                    Ok(guard) => guard,
-                    Err(_) => {
-                        return Err(GetError::TimelineGateClosed);
-                    }
-                };
-                trace!("creating new HandleInner");
-                let handle = Arc::new(
-                    // TODO: global metric that keeps track of the number of live HandlerTimeline instances
-                    // so we can identify reference cycle bugs.
-                    HandleInner {
-                        shut_down: AtomicBool::new(false),
-                        _gate_guard: gate_guard,
-                        timeline: timeline.clone(),
-                    },
-                );
-                let handle = {
-                    let mut lock_guard = timeline
-                        .per_timeline_state()
-                        .handles
-                        .lock()
-                        .expect("mutex poisoned");
-                    match &mut *lock_guard {
-                        Some(per_timeline_state) => {
-                            let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
-                            assert!(replaced.is_none(), "some earlier code left a stale handle");
-                            match self.map.entry(key) {
-                                hash_map::Entry::Occupied(_o) => {
-                                    // This cannot not happen because
-                                    // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
-                                    // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
-                                    //    while we were waiting for the tenant manager.
-                                    unreachable!()
-                                }
-                                hash_map::Entry::Vacant(v) => {
-                                    v.insert(Arc::downgrade(&handle));
-                                    handle
-                                }
-                            }
-                        }
-                        None => {
-                            return Err(GetError::PerTimelineStateShutDown);
-                        }
-                    }
-                };
-                Ok(Handle(handle))
-            }
-            Err(e) => Err(GetError::TenantManager(e)),
-        }
-    }
-}
-
-impl<T: Types> PerTimelineState<T> {
-    /// After this method returns, [`Cache::get`] will never again return a [`Handle`]
-    /// to the [`Types::Timeline`] that embeds this per-timeline state.
-    /// Even if [`TenantManager::resolve`] would still resolve to it.
-    ///
-    /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
-    /// That's ok because they're short-lived. See module-level comment for details.
-    #[instrument(level = "trace", skip_all)]
-    pub(super) fn shutdown(&self) {
-        let handles = self
-            .handles
-            .lock()
-            .expect("mutex poisoned")
-            // NB: this .take() sets locked to None.
-            // That's what makes future `Cache::get` misses fail.
-            // Cache hits are taken care of below.
-            .take();
-        let Some(handles) = handles else {
-            trace!("already shut down");
-            return;
-        };
-        for handle in handles.values() {
-            // Make hits fail.
-            handle.shut_down.store(true, Ordering::Relaxed);
-        }
-        drop(handles);
-    }
-}
-
-impl<T: Types> std::ops::Deref for Handle<T> {
-    type Target = T::Timeline;
-    fn deref(&self) -> &Self::Target {
-        &self.0.timeline
-    }
-}
-
-#[cfg(test)]
-impl<T: Types> Drop for HandleInner<T> {
-    fn drop(&mut self) {
-        trace!("HandleInner dropped");
-    }
-}
-
-// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
-impl<T: Types> Drop for Cache<T> {
-    fn drop(&mut self) {
-        for (_, weak) in self.map.drain() {
-            if let Some(strong) = weak.upgrade() {
-                // handle is still being kept alive in PerTimelineState
-                let timeline = strong.timeline.per_timeline_state();
-                let mut handles = timeline.handles.lock().expect("mutex poisoned");
-                if let Some(handles) = &mut *handles {
-                    let Some(removed) = handles.remove(&self.id) else {
-                        // There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
-                        continue;
-                    };
-                    assert!(Arc::ptr_eq(&removed, &strong));
-                }
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use pageserver_api::{
-        key::{rel_block_to_key, Key, DBDIR_KEY},
-        models::ShardParameters,
-        reltag::RelTag,
-        shard::ShardStripeSize,
-    };
-    use utils::shard::ShardCount;
-
-    use super::*;
-
-    const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
-
-    #[derive(Debug)]
-    struct TestTypes;
-    impl Types for TestTypes {
-        type TenantManagerError = anyhow::Error;
-        type TenantManager = StubManager;
-        type Timeline = Arc<StubTimeline>;
-    }
-
-    struct StubManager {
-        shards: Vec<Arc<StubTimeline>>,
-    }
-
-    struct StubTimeline {
-        gate: utils::sync::gate::Gate,
-        id: TimelineId,
-        shard: ShardIdentity,
-        per_timeline_state: PerTimelineState<TestTypes>,
-        myself: Weak<StubTimeline>,
-    }
-
-    impl StubTimeline {
-        fn getpage(&self) {
-            // do nothing
-        }
-    }
-
-    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
-        fn gate(&self) -> &utils::sync::gate::Gate {
-            &self.gate
-        }
-
-        fn shard_timeline_id(&self) -> ShardTimelineId {
-            ShardTimelineId {
-                shard_index: self.shard.shard_index(),
-                timeline_id: self.id,
-            }
-        }
-
-        fn get_shard_identity(&self) -> &ShardIdentity {
-            &self.shard
-        }
-
-        fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
-            &self.per_timeline_state
-        }
-    }
-
-    impl TenantManager<TestTypes> for StubManager {
-        async fn resolve(
-            &self,
-            timeline_id: TimelineId,
-            shard_selector: ShardSelector,
-        ) -> anyhow::Result<Arc<StubTimeline>> {
-            for timeline in &self.shards {
-                if timeline.id == timeline_id {
-                    match &shard_selector {
-                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Zero => continue,
-                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Page(_) => continue,
-                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Known(_) => continue,
-                    }
-                }
-            }
-            anyhow::bail!("not found")
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_timeline_shutdown() {
-        crate::tenant::harness::setup_logging();
-
-        let timeline_id = TimelineId::generate();
-        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mgr = StubManager {
-            shards: vec![shard0.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        //
-        // fill the cache
-        //
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (2, 1),
-            "strong: shard0, mgr; weak: myself"
-        );
-
-        let handle: Handle<_> = cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have the timeline");
-        let handle_inner_weak = Arc::downgrade(&handle.0);
-        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-        assert_eq!(
-            (
-                Weak::strong_count(&handle_inner_weak),
-                Weak::weak_count(&handle_inner_weak)
-            ),
-            (2, 2),
-            "strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
-        );
-        assert_eq!(cache.map.len(), 1);
-
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
-        );
-        drop(handle);
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
-        );
-
-        //
-        // demonstrate that Handle holds up gate closure
-        // but shutdown prevents new handles from being handed out
-        //
-
-        tokio::select! {
-            _ = shard0.gate.close() => {
-                panic!("cache and per-timeline handler state keep cache open");
-            }
-            _ = tokio::time::sleep(FOREVER) => {
-                // NB: first poll of close() makes it enter closing state
-            }
-        }
-
-        let handle = cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have the timeline");
-        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-
-        // SHUTDOWN
-        shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
-
-        assert_eq!(
-            1,
-            Weak::strong_count(&handle_inner_weak),
-            "through local var handle"
-        );
-        assert_eq!(
-            cache.map.len(),
-            1,
-            "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
-        );
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(via handle), shard0, mgr; weak: myself"
-        );
-
-        // this handle is perfectly usable
-        handle.getpage();
-
-        cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
-        assert_eq!(
-            cache.map.len(),
-            0,
-            "first access after shutdown cleans up the Weak's from the cache"
-        );
-
-        tokio::select! {
-            _ = shard0.gate.close() => {
-                panic!("handle is keeping gate open");
-            }
-            _ = tokio::time::sleep(FOREVER) => { }
-        }
-
-        drop(handle);
-        assert_eq!(
-            0,
-            Weak::strong_count(&handle_inner_weak),
-            "the HandleInner destructor already ran"
-        );
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (2, 1),
-            "strong: shard0, mgr; weak: myself"
-        );
-
-        // closing gate succeeds after dropping handle
-        tokio::select! {
-            _ = shard0.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("handle is dropped, no other gate holders exist")
-            }
-        }
-
-        // map gets cleaned on next lookup
-        cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown");
-        assert_eq!(cache.map.len(), 0);
-
-        // ensure all refs to shard0 are gone and we're not leaking anything
-        let myself = Weak::clone(&shard0.myself);
-        drop(shard0);
-        drop(mgr);
-        assert_eq!(Weak::strong_count(&myself), 0);
-    }
-
-    #[tokio::test]
-    async fn test_multiple_timelines_and_deletion() {
-        crate::tenant::harness::setup_logging();
-
-        let timeline_a = TimelineId::generate();
-        let timeline_b = TimelineId::generate();
-        assert_ne!(timeline_a, timeline_b);
-        let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_a,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_b,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mut mgr = StubManager {
-            shards: vec![timeline_a.clone(), timeline_b.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        cache
-            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have it");
-        cache
-            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have it");
-        assert_eq!(cache.map.len(), 2);
-
-        // delete timeline A
-        timeline_a.per_timeline_state.shutdown();
-        mgr.shards.retain(|t| t.id != timeline_a.id);
-        assert!(
-            mgr.resolve(timeline_a.id, ShardSelector::Page(key))
-                .await
-                .is_err(),
-            "broken StubManager implementation"
-        );
-
-        assert_eq!(
-            cache.map.len(),
-            2,
-            "cache still has a Weak handle to Timeline A"
-        );
-        cache
-            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown");
-        assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
-
-        cache
-            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we still have it");
-    }
-
-    fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
-        rel_block_to_key(
-            RelTag {
-                spcnode: 1663,
-                dbnode: 208101,
-                relnode: 2620,
-                forknum: 0,
-            },
-            shard.0 as u32 * params.stripe_size.0,
-        )
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_shard_split() {
-        crate::tenant::harness::setup_logging();
-        let timeline_id = TimelineId::generate();
-        let parent = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child_params = ShardParameters {
-            count: ShardCount(2),
-            stripe_size: ShardStripeSize::default(),
-        };
-        let child0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child1 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child_shards_by_shard_number = [child0.clone(), child1.clone()];
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        // fill the cache with the parent
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![parent.clone()],
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(&handle.myself, &parent.myself),
-                "mgr returns parent first"
-            );
-            drop(handle);
-        }
-
-        //
-        // SHARD SPLIT: tenant manager changes, but the cache isn't informed
-        //
-
-        // while we haven't shut down the parent, the cache will return the cached parent, even
-        // if the tenant manager returns the child
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(&handle.myself, &parent.myself),
-                "mgr returns parent"
-            );
-            drop(handle);
-        }
-
-        let parent_handle = cache
-            .get(
-                timeline_id,
-                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
-                &StubManager {
-                    shards: vec![parent.clone()],
-                },
-            )
-            .await
-            .expect("we have it");
-        assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
-
-        // invalidate the cache
-        parent.per_timeline_state.shutdown();
-
-        // the cache will now return the child, even though the parent handle still exists
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(
-                    &handle.myself,
-                    &child_shards_by_shard_number[i as usize].myself
-                ),
-                "mgr returns child"
-            );
-            drop(handle);
-        }
-
-        // all the while the parent handle kept the parent gate open
-        tokio::select! {
-            _ = parent_handle.gate.close() => {
-                panic!("parent handle is keeping gate open");
-            }
-            _ = tokio::time::sleep(FOREVER) => { }
-        }
-        drop(parent_handle);
-        tokio::select! {
-            _ = parent.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("parent handle is dropped, no other gate holders exist")
-            }
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_connection_handler_exit() {
-        crate::tenant::harness::setup_logging();
-        let timeline_id = TimelineId::generate();
-        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mgr = StubManager {
-            shards: vec![shard0.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        // Simulate 10 connections that's opened, used, and closed
-        let mut used_handles = vec![];
-        for _ in 0..10 {
-            let mut cache = Cache::<TestTypes>::default();
-            let handle = {
-                let handle = cache
-                    .get(timeline_id, ShardSelector::Page(key), &mgr)
-                    .await
-                    .expect("we have the timeline");
-                assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-                handle
-            };
-            handle.getpage();
-            used_handles.push(Arc::downgrade(&handle.0));
-        }
-
-        // No handles exist, thus gates are closed and don't require shutdown
-        assert!(used_handles
-            .iter()
-            .all(|weak| Weak::strong_count(weak) == 0));
-
-        // ... thus the gate should close immediately, even without shutdown
-        tokio::select! {
-            _ = shard0.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("handle is dropped, no other gate holders exist")
-            }
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,4 +1,4 @@
-use anyhow::{bail, ensure, Context};
+use anyhow::{bail, ensure, Context, Result};
 use itertools::Itertools;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
@@ -24,142 +24,35 @@ use crate::{
 use super::TimelineWriterState;

 /// Provides semantic APIs to manipulate the layer map.
-pub(crate) enum LayerManager {
-    /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate
-    /// the layers.
-    Open(OpenLayerManager),
-    /// Shutdown layer manager where there are no more in-memory layers and persistent layers are
-    /// read-only.
-    Closed {
-        layers: HashMap<PersistentLayerKey, Layer>,
-    },
-}
-
-impl Default for LayerManager {
-    fn default() -> Self {
-        LayerManager::Open(OpenLayerManager::default())
-    }
+#[derive(Default)]
+pub(crate) struct LayerManager {
+    layer_map: LayerMap,
+    layer_fmgr: LayerFileManager<Layer>,
 }

 impl LayerManager {
-    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
-        // The assumption for the `expect()` is that all code maintains the following invariant:
-        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
-        self.layers()
-            .get(key)
-            .with_context(|| format!("get layer from key: {key}"))
-            .expect("not found")
-            .clone()
-    }
-
    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
-        self.get_from_key(&desc.key())
+        self.layer_fmgr.get_from_desc(desc)
    }

    /// Get an immutable reference to the layer map.
    ///
    /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
    /// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
-    pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> {
-        use LayerManager::*;
-        match self {
-            Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map),
-            Closed { .. } => Err(Shutdown),
-        }
+    pub(crate) fn layer_map(&self) -> &LayerMap {
+        &self.layer_map
    }

-    pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> {
-        use LayerManager::*;
-
-        match self {
-            Open(open) => Ok(open),
-            Closed { .. } => Err(Shutdown),
-        }
-    }
-
-    /// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in
-    /// order to allow shutdown to complete.
-    ///
-    /// If there was a want to flush in-memory layers, it must have happened earlier.
-    pub(crate) fn shutdown(&mut self, writer_state: &mut Option<TimelineWriterState>) {
-        use LayerManager::*;
-        match self {
-            Open(OpenLayerManager {
-                layer_map,
-                layer_fmgr: LayerFileManager(hashmap),
-            }) => {
-                let open = layer_map.open_layer.take();
-                let frozen = layer_map.frozen_layers.len();
-                let taken_writer_state = writer_state.take();
-                tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers");
-                let layers = std::mem::take(hashmap);
-                *self = Closed { layers };
-                assert_eq!(open.is_some(), taken_writer_state.is_some());
-            }
-            Closed { .. } => {
-                tracing::debug!("ignoring multiple shutdowns on layer manager")
-            }
-        }
-    }
-
-    /// Sum up the historic layer sizes
-    pub(crate) fn layer_size_sum(&self) -> u64 {
-        self.layers()
-            .values()
-            .map(|l| l.layer_desc().file_size)
-            .sum()
-    }
-
-    pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = &'_ Layer> + '_ {
-        self.layers().values().filter(|l| l.is_likely_resident())
-    }
-
-    pub(crate) fn contains(&self, layer: &Layer) -> bool {
-        self.contains_key(&layer.layer_desc().key())
-    }
-
-    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
-        self.layers().contains_key(key)
-    }
-
-    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
-        self.layers().keys().cloned().collect_vec()
-    }
-
-    fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
-        use LayerManager::*;
-        match self {
-            Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0,
-            Closed { layers } => layers,
-        }
-    }
-}
-
-#[derive(Default)]
-pub(crate) struct OpenLayerManager {
-    layer_map: LayerMap,
-    layer_fmgr: LayerFileManager<Layer>,
-}
-
-impl std::fmt::Debug for OpenLayerManager {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("OpenLayerManager")
-            .field("layer_count", &self.layer_fmgr.0.len())
-            .finish()
-    }
-}
-
-#[derive(Debug, thiserror::Error)]
-#[error("layer manager has been shutdown")]
-pub(crate) struct Shutdown;
-
-impl OpenLayerManager {
    /// Called from `load_layer_map`. Initialize the layer manager with:
    /// 1. all on-disk layers
    /// 2. next open layer (with disk disk_consistent_lsn LSN)
-    pub(crate) fn initialize_local_layers(&mut self, layers: Vec<Layer>, next_open_layer_at: Lsn) {
+    pub(crate) fn initialize_local_layers(
+        &mut self,
+        on_disk_layers: Vec<Layer>,
+        next_open_layer_at: Lsn,
+    ) {
        let mut updates = self.layer_map.batch_update();
-        for layer in layers {
+        for layer in on_disk_layers {
            Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
        }
        updates.flush();
@@ -171,19 +64,26 @@ impl OpenLayerManager {
        self.layer_map.next_open_layer_at = Some(next_open_layer_at);
    }

-    /// Open a new writable layer to append data if there is no open layer, otherwise return the
-    /// current open layer, called within `get_layer_for_write`.
+    /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
+    /// called within `get_layer_for_write`.
    pub(crate) async fn get_layer_for_write(
        &mut self,
        lsn: Lsn,
+        last_record_lsn: Lsn,
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
-        gate_guard: utils::sync::gate::GateGuard,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<InMemoryLayer>> {
+    ) -> Result<Arc<InMemoryLayer>> {
        ensure!(lsn.is_aligned());

+        ensure!(
+            lsn > last_record_lsn,
+            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
+            lsn,
+            last_record_lsn,
+        );
+
        // Do we have a layer open for writing already?
        let layer = if let Some(open_layer) = &self.layer_map.open_layer {
            if open_layer.get_lsn_range().start > lsn {
@@ -209,15 +109,8 @@ impl OpenLayerManager {
                lsn
            );

-            let new_layer = InMemoryLayer::create(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                start_lsn,
-                gate_guard,
-                ctx,
-            )
-            .await?;
+            let new_layer =
+                InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
            let layer = Arc::new(new_layer);

            self.layer_map.open_layer = Some(layer.clone());
@@ -271,7 +164,7 @@ impl OpenLayerManager {
        froze
    }

-    /// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`].
+    /// Add image layers to the layer map, called from `create_image_layers`.
    pub(crate) fn track_new_image_layers(
        &mut self,
        image_layers: &[ResidentLayer],
@@ -344,7 +237,7 @@ impl OpenLayerManager {
        self.finish_compact_l0(compact_from, compact_to, metrics)
    }

-    /// Called post-compaction when some previous generation image layers were trimmed.
+    /// Called when compaction is completed.
    pub(crate) fn rewrite_layers(
        &mut self,
        rewrite_layers: &[(Layer, ResidentLayer)],
@@ -362,10 +255,13 @@ impl OpenLayerManager {
                new_layer.layer_desc().lsn_range
            );

-            // Transfer visibility hint from old to new layer, since the new layer covers the same key space.  This is not guaranteed to
+            // Transfer visibility hint from old to new layer, since the new layer covers the same key space.  This is not guaranteed to
            // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
            // always marking rewritten layers as visible.
-            new_layer.as_ref().set_visibility(old_layer.visibility());
+            new_layer
+                .as_ref()
+                .access_stats()
+                .set_visibility(old_layer.access_stats().visibility());

            // Safety: we may never rewrite the same file in-place.  Callers are responsible
            // for ensuring that they only rewrite layers after something changes the path,
@@ -433,6 +329,31 @@ impl OpenLayerManager {
        mapping.remove(layer);
        layer.delete_on_drop();
    }
+
+    pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
+        // For small layer maps most layers are likely resident; in larger maps more are likely
+        // to have been evicted, assuming a large layer count correlates with a longer lifespan.
+
+        self.layer_map().iter_historic_layers().filter_map(|desc| {
+            self.layer_fmgr
+                .0
+                .get(&desc.key())
+                .filter(|l| l.is_likely_resident())
+                .cloned()
+        })
+    }
+
+    pub(crate) fn contains(&self, layer: &Layer) -> bool {
+        self.layer_fmgr.contains(layer)
+    }
+
+    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
+        self.layer_fmgr.contains_key(key)
+    }
+
+    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
+        self.layer_fmgr.0.keys().cloned().collect_vec()
+    }
 }

 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
@@ -444,6 +365,20 @@ impl<T> Default for LayerFileManager<T> {
 }

 impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
+        // The assumption for the `expect()` is that all code maintains the following invariant:
+        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
+        self.0
+            .get(&desc.key())
+            .with_context(|| format!("get layer from desc: {}", desc.layer_name()))
+            .expect("not found")
+            .clone()
+    }
+
+    fn contains_key(&self, key: &PersistentLayerKey) -> bool {
+        self.0.contains_key(key)
+    }
+
    pub(crate) fn insert(&mut self, layer: T) {
        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
        if present.is_some() && cfg!(debug_assertions) {
@@ -451,6 +386,10 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
        }
    }

+    pub(crate) fn contains(&self, layer: &T) -> bool {
+        self.0.contains_key(&layer.layer_desc().key())
+    }
+
    pub(crate) fn remove(&mut self, layer: &T) {
        let present = self.0.remove(&layer.layer_desc().key());
        if present.is_none() && cfg!(debug_assertions) {
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -122,10 +122,6 @@ impl CurrentLogicalSize {
            Self::Exact(_) => Accuracy::Exact,
        }
    }
-
-    pub(crate) fn is_exact(&self) -> bool {
-        matches!(self, Self::Exact(_))
-    }
 }

 impl LogicalSize {
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -30,12 +30,10 @@ use tokio::time::Instant;
 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
 pub use io_engine::feature_test as io_engine_feature_test;
-pub use io_engine::io_engine_for_bench;
 pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
 use self::owned_buffers_io::write::OwnedAsyncWriter;
-pub(crate) use api::DirectIoMode;
 pub(crate) use io_engine::IoEngineKind;
 pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -328,29 +328,3 @@ pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
    .join()
    .unwrap()
 }
-
-/// For use in benchmark binaries only.
-///
-/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also
-/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste
-/// developer time trying to figure out why it's slow.
-///
-/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic.
-pub fn io_engine_for_bench() -> IoEngineKind {
-    #[cfg(not(target_os = "linux"))]
-    {
-        panic!("This benchmark does I/O and can only give a representative result on Linux");
-    }
-    #[cfg(target_os = "linux")]
-    {
-        match feature_test().unwrap() {
-            FeatureTestResult::PlatformPreferred(engine) => engine,
-            FeatureTestResult::Worse {
-                engine: _engine,
-                remark,
-            } => {
-                panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}");
-            }
-        }
-    }
-}
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -618,7 +618,7 @@ impl WalIngest {
                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
                                0
                            } else {
-                                size_of::<u16>() * xlrec.ntuples as usize
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
                            };
                        assert_eq!(offset_array_len, buf.remaining());

@@ -685,7 +685,7 @@ impl WalIngest {
                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
                                0
                            } else {
-                                size_of::<u16>() * xlrec.ntuples as usize
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
                            };
                        assert_eq!(offset_array_len, buf.remaining());

@@ -752,7 +752,7 @@ impl WalIngest {
                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
                                0
                            } else {
-                                size_of::<u16>() * xlrec.ntuples as usize
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
                            };
                        assert_eq!(offset_array_len, buf.remaining());

@@ -920,7 +920,7 @@ impl WalIngest {
                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
                                0
                            } else {
-                                size_of::<u16>() * xlrec.ntuples as usize
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
                            };
                        assert_eq!(offset_array_len, buf.remaining());

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -241,9 +241,6 @@ impl PostgresRedoManager {

    /// Shut down the WAL redo manager.
    ///
-    /// Returns `true` if this call was the one that initiated shutdown.
-    /// `true` may be observed by no caller if the first caller stops polling.
-    ///
    /// After this future completes
    /// - no redo process is running
    /// - no new redo process will be spawned
@@ -253,32 +250,22 @@ impl PostgresRedoManager {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub async fn shutdown(&self) -> bool {
+    pub async fn shutdown(&self) {
        // prevent new processes from being spawned
-        let maybe_permit = match self.redo_process.get_or_init_detached().await {
+        let permit = match self.redo_process.get_or_init_detached().await {
            Ok(guard) => {
-                if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
-                    None
-                } else {
-                    let (proc, permit) = guard.take_and_deinit();
-                    drop(proc); // this just drops the Arc, its refcount may not be zero yet
-                    Some(permit)
-                }
+                let (proc, permit) = guard.take_and_deinit();
+                drop(proc); // this just drops the Arc, its refcount may not be zero yet
+                permit
            }
-            Err(permit) => Some(permit),
-        };
-        let it_was_us = if let Some(permit) = maybe_permit {
-            self.redo_process
-                .set(ProcessOnceCell::ManagerShutDown, permit);
-            true
-        } else {
-            false
+            Err(permit) => permit,
        };
+        self.redo_process
+            .set(ProcessOnceCell::ManagerShutDown, permit);
        // wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
        // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
        // for the underlying process.
        self.launched_processes.close().await;
-        it_was_us
    }

    /// This type doesn't have its own background task to check for idleness: we
--- a/pageserver/test_data/indices/mixed_workload/README.md
+++ b/pageserver/test_data/indices/mixed_workload/README.md
@@ -1,7 +0,0 @@
-
-# This was captured from one shard of a large tenant in staging.
-
-# It has a mixture of deltas and image layers, >1000 layers in total.
-
-# This is suitable for general smoke tests that want an index which is not
-# trivially small, but doesn't contain weird/pathological cases.
--- a/pageserver/test_data/indices/mixed_workload/index_part.json
+++ b/pageserver/test_data/indices/mixed_workload/index_part.json
--- a/poetry.lock
+++ b/poetry.lock
@@ -870,96 +870,6 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}

-[[package]]
-name = "clickhouse-connect"
-version = "0.7.17"
-description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
-optional = false
-python-versions = "~=3.8"
-files = [
-    {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
-    {file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
-    {file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
-    {file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
-    {file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
-    {file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
-    {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
-    {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
-    {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
-]
-
-[package.dependencies]
-certifi = "*"
-lz4 = "*"
-pytz = "*"
-urllib3 = ">=1.26"
-zstandard = "*"
-
-[package.extras]
-arrow = ["pyarrow"]
-numpy = ["numpy"]
-orjson = ["orjson"]
-pandas = ["pandas"]
-sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
-tzlocal = ["tzlocal (>=4.0)"]
-
 [[package]]
 name = "colorama"
 version = "0.4.5"
@@ -1514,20 +1424,6 @@ files = [
 [package.dependencies]
 six = "*"

-[[package]]
-name = "kafka-python"
-version = "2.0.2"
-description = "Pure Python client for Apache Kafka"
-optional = false
-python-versions = "*"
-files = [
-    {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
-    {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
-]
-
-[package.extras]
-crc32c = ["crc32c"]
-
 [[package]]
 name = "lazy-object-proxy"
 version = "1.10.0"
@@ -1574,56 +1470,6 @@ files = [
    {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
 ]

-[[package]]
-name = "lz4"
-version = "4.3.3"
-description = "LZ4 Bindings for Python"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
-    {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
-    {file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
-    {file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
-    {file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
-    {file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
-    {file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
-    {file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
-    {file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
-    {file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
-    {file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
-    {file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
-    {file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
-    {file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
-    {file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
-    {file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
-    {file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
-    {file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
-    {file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
-    {file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
-    {file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
-    {file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
-    {file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
-    {file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
-    {file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
-    {file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
-]
-
-[package.extras]
-docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
-flake8 = ["flake8"]
-tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
-
 [[package]]
 name = "markupsafe"
 version = "2.1.1"
@@ -2515,17 +2361,6 @@ files = [
 [package.dependencies]
 six = ">=1.5"

-[[package]]
-name = "pytz"
-version = "2024.1"
-description = "World timezone definitions, modern and historical"
-optional = false
-python-versions = "*"
-files = [
-    {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
-    {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
-]
-
 [[package]]
 name = "pywin32"
 version = "301"
@@ -3371,4 +3206,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72"
+content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -92,7 +92,6 @@ tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 tracing.workspace = true
-try-lock.workspace = true
 typed-json.workspace = true
 url.workspace = true
 urlencoding.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -218,7 +218,7 @@ impl RateBucketInfo {
 impl AuthenticationConfig {
    pub fn check_rate_limit(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        config: &AuthenticationConfig,
        secret: AuthSecret,
        endpoint: &EndpointId,
@@ -243,7 +243,7 @@ impl AuthenticationConfig {
        let limit_not_exceeded = self.rate_limiter.check(
            (
                endpoint_int,
-                MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet),
+                MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
            ),
            password_weight,
        );
@@ -274,7 +274,7 @@ impl AuthenticationConfig {
 ///
 /// All authentication flows will emit an AuthenticationOk message if successful.
 async fn auth_quirks(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    api: &impl console::Api,
    user_info: ComputeUserInfoMaybeEndpoint,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -303,8 +303,8 @@ async fn auth_quirks(
    let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;

    // check allowed list
-    if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
-        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
+    if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
+        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
    }

    if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
@@ -356,7 +356,7 @@ async fn auth_quirks(
 }

 async fn authenticate_with_secret(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    secret: AuthSecret,
    info: ComputeUserInfo,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -421,7 +421,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
    #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
    pub async fn authenticate(
        self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
@@ -467,7 +467,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
 impl BackendType<'_, ComputeUserInfo, &()> {
    pub async fn get_role_secret(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
        use BackendType::*;
        match self {
@@ -478,7 +478,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {

    pub async fn get_allowed_ips_and_secret(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
        use BackendType::*;
        match self {
@@ -492,7 +492,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
 impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
    async fn wake_compute(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
        use BackendType::*;

@@ -514,7 +514,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
 impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
    async fn wake_compute(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
        use BackendType::*;

@@ -571,7 +571,7 @@ mod tests {
    impl console::Api for Auth {
        async fn get_role_secret(
            &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &mut RequestMonitoring,
            _user_info: &super::ComputeUserInfo,
        ) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
            Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
@@ -579,7 +579,7 @@ mod tests {

        async fn get_allowed_ips_and_secret(
            &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &mut RequestMonitoring,
            _user_info: &super::ComputeUserInfo,
        ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
        {
@@ -591,7 +591,7 @@ mod tests {

        async fn wake_compute(
            &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &mut RequestMonitoring,
            _user_info: &super::ComputeUserInfo,
        ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
            unimplemented!()
@@ -665,7 +665,7 @@ mod tests {
        let (mut client, server) = tokio::io::duplex(1024);
        let mut stream = PqStream::new(Stream::from_raw(server));

-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let api = Auth {
            ips: vec![],
            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -723,7 +723,7 @@ mod tests {
        ));

        let _creds = auth_quirks(
-            &ctx,
+            &mut ctx,
            &api,
            user_info,
            &mut stream,
@@ -742,7 +742,7 @@ mod tests {
        let (mut client, server) = tokio::io::duplex(1024);
        let mut stream = PqStream::new(Stream::from_raw(server));

-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let api = Auth {
            ips: vec![],
            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -775,7 +775,7 @@ mod tests {
        ));

        let _creds = auth_quirks(
-            &ctx,
+            &mut ctx,
            &api,
            user_info,
            &mut stream,
@@ -794,7 +794,7 @@ mod tests {
        let (mut client, server) = tokio::io::duplex(1024);
        let mut stream = PqStream::new(Stream::from_raw(server));

-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let api = Auth {
            ips: vec![],
            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -828,7 +828,7 @@ mod tests {
        ));

        let creds = auth_quirks(
-            &ctx,
+            &mut ctx,
            &api,
            user_info,
            &mut stream,
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -12,7 +12,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};

 pub(super) async fn authenticate(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    creds: ComputeUserInfo,
    client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    config: &'static AuthenticationConfig,
@@ -27,7 +27,7 @@ pub(super) async fn authenticate(
        }
        AuthSecret::Scram(secret) => {
            info!("auth endpoint chooses SCRAM");
-            let scram = auth::Scram(&secret, ctx);
+            let scram = auth::Scram(&secret, &mut *ctx);

            let auth_outcome = tokio::time::timeout(
                config.scram_protocol_timeout,
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -18,7 +18,7 @@ use tracing::{info, warn};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub async fn authenticate_cleartext(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    info: ComputeUserInfo,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    secret: AuthSecret,
@@ -28,7 +28,7 @@ pub async fn authenticate_cleartext(
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

    // pause the timer while we communicate with the client
-    let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+    let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

    let ep = EndpointIdInt::from(&info.endpoint);

@@ -60,7 +60,7 @@ pub async fn authenticate_cleartext(
 /// Similar to [`authenticate_cleartext`], but there's a specific password format,
 /// and passwords are not yet validated (we don't know how to validate them!)
 pub async fn password_hack_no_authentication(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    info: ComputeUserInfoNoEndpoint,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
 ) -> auth::Result<ComputeCredentials> {
@@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication(
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

    // pause the timer while we communicate with the client
-    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+    let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

    let payload = AuthFlow::new(client)
        .begin(auth::PasswordHack)
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String {
 }

 pub(super) async fn authenticate(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<NodeInfo> {
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -84,7 +84,7 @@ pub fn endpoint_sni(

 impl ComputeUserInfoMaybeEndpoint {
    pub fn parse(
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        params: &StartupMessageParams,
        sni: Option<&str>,
        common_names: Option<&HashSet<String>>,
@@ -249,8 +249,8 @@ mod tests {
    fn parse_bare_minimum() -> anyhow::Result<()> {
        // According to postgresql, only `user` should be required.
        let options = StartupMessageParams::new([("user", "john_doe")]);
-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id, None);

@@ -264,8 +264,8 @@ mod tests {
            ("database", "world"), // should be ignored
            ("foo", "bar"),        // should be ignored
        ]);
-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id, None);

@@ -279,9 +279,9 @@ mod tests {
        let sni = Some("foo.localhost");
        let common_names = Some(["localhost".into()].into());

-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
        assert_eq!(user_info.options.get_cache_key("foo"), "foo");
@@ -296,8 +296,8 @@ mod tests {
            ("options", "-ckey=1 project=bar -c geqo=off"),
        ]);

-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));

@@ -311,8 +311,8 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar -c geqo=off"),
        ]);

-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));

@@ -329,8 +329,8 @@ mod tests {
            ),
        ]);

-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert!(user_info.endpoint_id.is_none());

@@ -344,8 +344,8 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
        ]);

-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert!(user_info.endpoint_id.is_none());

@@ -359,9 +359,9 @@ mod tests {
        let sni = Some("baz.localhost");
        let common_names = Some(["localhost".into()].into());

-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));

@@ -374,16 +374,16 @@ mod tests {

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.a.com");
-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.b.com");
-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));

        Ok(())
@@ -397,9 +397,10 @@ mod tests {
        let sni = Some("second.localhost");
        let common_names = Some(["localhost".into()].into());

-        let ctx = RequestMonitoring::test();
-        let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
-            .expect_err("should fail");
+        let mut ctx = RequestMonitoring::test();
+        let err =
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
+                .expect_err("should fail");
        match err {
            InconsistentProjectNames { domain, option } => {
                assert_eq!(option, "first");
@@ -416,9 +417,10 @@ mod tests {
        let sni = Some("project.localhost");
        let common_names = Some(["example.com".into()].into());

-        let ctx = RequestMonitoring::test();
-        let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
-            .expect_err("should fail");
+        let mut ctx = RequestMonitoring::test();
+        let err =
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
+                .expect_err("should fail");
        match err {
            UnknownCommonName { cn } => {
                assert_eq!(cn, "localhost");
@@ -436,9 +438,9 @@ mod tests {

        let sni = Some("project.localhost");
        let common_names = Some(["localhost".into()].into());
-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
        assert_eq!(
            user_info.options.get_cache_key("project"),
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -27,7 +27,7 @@ pub trait AuthMethod {
 pub struct Begin;

 /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
-pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring);
+pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring);

 impl AuthMethod for Scram<'_> {
    #[inline(always)]
@@ -155,7 +155,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
        let Scram(secret, ctx) = self.state;

        // pause the timer while we communicate with the client
-        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+        let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

        // Initial client message contains the chosen auth method's name.
        let msg = self.stream.read_password_message().await?;
@@ -168,8 +168,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
        }

        match sasl.method {
-            SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256),
-            SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
+            SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256),
+            SCRAM_SHA_256_PLUS => {
+                ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus)
+            }
            _ => {}
        }
        info!("client chooses {}", sasl.method);
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -205,7 +205,7 @@ async fn task_main(
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";

 async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    raw_stream: S,
    tls_config: Arc<rustls::ServerConfig>,
    tls_server_end_point: TlsServerEndPoint,
@@ -256,13 +256,13 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
 }

 async fn handle_client(
-    ctx: RequestMonitoring,
+    mut ctx: RequestMonitoring,
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
    tls_server_end_point: TlsServerEndPoint,
    stream: impl AsyncRead + AsyncWrite + Unpin,
 ) -> anyhow::Result<()> {
-    let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?;
+    let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?;

    // Cut off first part of the SNI domain
    // We receive required destination details in the format of
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -5,7 +5,6 @@ use aws_config::meta::region::RegionProviderChain;
 use aws_config::profile::ProfileFileCredentialsProvider;
 use aws_config::provider_config::ProviderConfig;
 use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
-use aws_config::Region;
 use futures::future::Either;
 use proxy::auth;
 use proxy::auth::backend::AuthRateLimiter;
@@ -291,10 +290,9 @@ async fn main() -> anyhow::Result<()> {
    let config = build_config(&args)?;

    info!("Authentication backend: {}", config.auth_backend);
-    info!("Using region: {}", args.aws_region);
+    info!("Using region: {}", config.aws_region);

-    let region_provider =
-        RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone()));
+    let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
    let provider_conf =
        ProviderConfig::without_region().with_region(region_provider.region().await);
    let aws_credentials_provider = {
@@ -320,7 +318,7 @@ async fn main() -> anyhow::Result<()> {
    };
    let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
        elasticache::AWSIRSAConfig::new(
-            args.aws_region.clone(),
+            config.aws_region.clone(),
            args.redis_cluster_name,
            args.redis_user_id,
        ),
@@ -378,14 +376,11 @@ async fn main() -> anyhow::Result<()> {

    let cancel_map = CancelMap::default();

-    let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone());
-    RateBucketInfo::validate(redis_rps_limit)?;
-
    let redis_publisher = match &regional_redis_client {
        Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
            redis_publisher.clone(),
            args.region.clone(),
-            redis_rps_limit,
+            &config.redis_rps_limit,
        )?))),
        None => None,
    };
@@ -661,6 +656,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    )?;

    let http_config = HttpConfig {
+        request_timeout: args.sql_over_http.sql_over_http_timeout,
        pool_options: GlobalConnPoolOptions {
            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
            gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
@@ -680,6 +676,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
    };

+    let mut redis_rps_limit = args.redis_rps_limit.clone();
+    RateBucketInfo::validate(&mut redis_rps_limit)?;
+
    let config = Box::leak(Box::new(ProxyConfig {
        tls_config,
        auth_backend,
@@ -688,8 +687,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        http_config,
        authentication_config,
        require_client_ip: args.require_client_ip,
+        disable_ip_check_for_http: args.disable_ip_check_for_http,
+        redis_rps_limit,
        handshake_timeout: args.handshake_timeout,
        region: args.region.clone(),
+        aws_region: args.aws_region.clone(),
        wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
        connect_compute_locks,
        connect_to_compute_retry_config: config::RetryConfig::parse(
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -68,7 +68,7 @@ impl EndpointsCache {
            ready: AtomicBool::new(false),
        }
    }
-    pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
+    pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool {
        if !self.ready.load(Ordering::Acquire) {
            return true;
        }
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -288,12 +288,12 @@ impl ConnCfg {
    /// Connect to a corresponding compute node.
    pub async fn connect(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        allow_self_signed_compute: bool,
        aux: MetricsAuxInfo,
        timeout: Duration,
    ) -> Result<PostgresConnection, ConnectionError> {
-        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
+        let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
        let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
        drop(pause);

@@ -316,14 +316,14 @@ impl ConnCfg {
        )?;

        // connect_raw() will not use TLS if sslmode is "disable"
-        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
+        let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
        let (client, connection) = self.0.connect_raw(stream, tls).await?;
        drop(pause);
        tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
        let stream = connection.stream.into_inner();

        info!(
-            cold_start_info = ctx.cold_start_info().as_str(),
+            cold_start_info = ctx.cold_start_info.as_str(),
            "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
            self.0.get_ssl_mode()
        );
@@ -342,7 +342,7 @@ impl ConnCfg {
            params,
            cancel_closure,
            aux,
-            _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
+            _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol),
        };

        Ok(connection)
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -31,8 +31,11 @@ pub struct ProxyConfig {
    pub http_config: HttpConfig,
    pub authentication_config: AuthenticationConfig,
    pub require_client_ip: bool,
+    pub disable_ip_check_for_http: bool,
+    pub redis_rps_limit: Vec<RateBucketInfo>,
    pub region: String,
    pub handshake_timeout: Duration,
+    pub aws_region: String,
    pub wake_compute_retry_config: RetryConfig,
    pub connect_compute_locks: ApiLocks<Host>,
    pub connect_to_compute_retry_config: RetryConfig,
@@ -52,6 +55,7 @@ pub struct TlsConfig {
 }

 pub struct HttpConfig {
+    pub request_timeout: tokio::time::Duration,
    pub pool_options: GlobalConnPoolOptions,
    pub cancel_set: CancelSet,
    pub client_conn_threshold: u64,
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -292,7 +292,7 @@ pub struct NodeInfo {
 impl NodeInfo {
    pub async fn connect(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        timeout: Duration,
    ) -> Result<compute::PostgresConnection, compute::ConnectionError> {
        self.config
@@ -330,20 +330,20 @@ pub(crate) trait Api {
    /// We still have to mock the scram to avoid leaking information that user doesn't exist.
    async fn get_role_secret(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;

    async fn get_allowed_ips_and_secret(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;

    /// Wake up the compute node and return the corresponding connection info.
    async fn wake_compute(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
 }
@@ -363,7 +363,7 @@ pub enum ConsoleBackend {
 impl Api for ConsoleBackend {
    async fn get_role_secret(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
        use ConsoleBackend::*;
@@ -378,7 +378,7 @@ impl Api for ConsoleBackend {

    async fn get_allowed_ips_and_secret(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
        use ConsoleBackend::*;
@@ -393,7 +393,7 @@ impl Api for ConsoleBackend {

    async fn wake_compute(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
        use ConsoleBackend::*;
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -158,7 +158,7 @@ impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn get_role_secret(
        &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
        Ok(CachedRoleSecret::new_uncached(
@@ -168,7 +168,7 @@ impl super::Api for Api {

    async fn get_allowed_ips_and_secret(
        &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
        Ok((
@@ -182,7 +182,7 @@ impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn wake_compute(
        &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &mut RequestMonitoring,
        _user_info: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
        self.do_wake_compute().map_ok(Cached::new_uncached).await
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Joonas Koivunen	dcfd92b6db	test: funroll-loop first iteration in test_retried_detach_ancestor_after_failed_reparenting	2024-07-26 14:39:32 +00:00
Joonas Koivunen	f58636ffdd	test: refactor -- begin to -funroll-loops in test_retried_detach_ancestor_after_failed_reparenting	2024-07-26 14:39:32 +00:00
Joonas Koivunen	f3ac5bcbe1	test: ensure gc is unpaused with the earlier deletion test	2024-07-26 14:39:32 +00:00
Joonas Koivunen	eb3711b881	doc: why no cancel	2024-07-26 14:39:32 +00:00
Joonas Koivunen	c864166b32	test: make sure gc gets unblocked by late deletion	2024-07-26 14:39:32 +00:00
Joonas Koivunen	ce9b5ae7bf	test: allow the 500 error crutch temporarily	2024-07-26 14:39:32 +00:00
Joonas Koivunen	cd2cbe0691	test: rename test_deletion_after_timeline_ancestor_detach_before_completion	2024-07-26 14:39:32 +00:00
Joonas Koivunen	7f241bd379	refactor: remove needless Error::from	2024-07-26 14:39:32 +00:00
Joonas Koivunen	ff52901028	refactor: still_ongoing assert is shared in all paths	2024-07-26 14:39:32 +00:00
Joonas Koivunen	bb377a3544	fixup: make sure detach_ancestor is blocking gc	2024-07-26 14:39:32 +00:00
Joonas Koivunen	5ece7af497	doc: remove confusing comment	2024-07-26 14:39:32 +00:00
Joonas Koivunen	2be3027fa5	doc: elaborate on weird query	2024-07-26 14:39:32 +00:00
Joonas Koivunen	14a0517c7f	also assert still ongoing	2024-07-26 14:39:32 +00:00
Joonas Koivunen	dcff25c293	chore: adjust assert message	2024-07-26 14:39:32 +00:00
Joonas Koivunen	f80c37b733	chore: forgotten to update panic text with detach_and_reparent renaming	2024-07-26 14:39:32 +00:00
Joonas Koivunen	b9d0b26cea	doc: remove possibly wrong comment	2024-07-26 14:39:32 +00:00
Joonas Koivunen	c2c28f211b	doc: explain returning option	2024-07-26 14:39:32 +00:00
Joonas Koivunen	1ebcb1c45b	doc: clean out FIXME we cannot protect against willful misuse. I had been thinking of witness of Attempt but ...	2024-07-26 14:39:32 +00:00
Joonas Koivunen	66d750ec20	info log on detach	2024-07-26 14:39:32 +00:00
Joonas Koivunen	ba3a6645e7	fix: info log line again, botched rebase?	2024-07-26 14:39:32 +00:00
Joonas Koivunen	8885a8c482	fixup: missed hashset change	2024-07-26 14:39:32 +00:00
Joonas Koivunen	c8880b69fb	stop with the (ancestor_lsn, timeline_id) ordered reparented I was thinking of the case where we have multiple reparented at the same ancestor_lsn. But of course, that is not a problem if we compare the reparented as a set...	2024-07-26 14:39:32 +00:00
Joonas Koivunen	274b2a611b	test: handle the case where timeline cannot be found at least do not double-panic.	2024-07-26 14:39:32 +00:00
Joonas Koivunen	a7153bf9b2	test: forgotten allowed errors	2024-07-26 14:39:32 +00:00
Joonas Koivunen	8a4236a441	test: remove needless s3 storage	2024-07-26 14:39:32 +00:00
Joonas Koivunen	7ec927e43b	test: cleanup todos	2024-07-26 14:39:32 +00:00
Joonas Koivunen	22470ef444	test: comment	2024-07-26 14:39:32 +00:00
Joonas Koivunen	8248cbb45b	test: ensure persisted gc blocking works across restart	2024-07-26 14:39:32 +00:00
Joonas Koivunen	4dd805b68a	test: remove the extra deletion which was confusing it had already been reparented, so it was not needed.	2024-07-26 14:39:32 +00:00
Joonas Koivunen	f582675452	test: refactor repetition	2024-07-26 14:39:32 +00:00
Joonas Koivunen	48069f68bb	chore: forgotten pyfmt	2024-07-26 14:39:32 +00:00
Joonas Koivunen	8f52139913	additional assert in completion	2024-07-26 14:39:32 +00:00
Joonas Koivunen	fc4d80bbf2	elaborate on TODO for which a test is later added	2024-07-26 14:39:32 +00:00
Joonas Koivunen	dc83a5a978	fixup dae8c75c04 test: cannot be parametrized over return or exit	2024-07-26 14:39:32 +00:00
Joonas Koivunen	f4fb08d869	stop masking the topmost error in http handler	2024-07-26 14:39:31 +00:00
Joonas Koivunen	75b326faf4	test: complete fixmes	2024-07-26 14:39:31 +00:00
Joonas Koivunen	c23cd5c149	ongoing_detach_ancestor => gc_blocking in index_part	2024-07-26 14:39:31 +00:00
Joonas Koivunen	f4cd9fe40b	refactor: misc after attempt to add lock_in_reparentable	2024-07-26 14:39:31 +00:00
Joonas Koivunen	43af9484c0	doc: schedule_reparenting_and_wait	2024-07-26 14:39:31 +00:00
Joonas Koivunen	842bd4c2db	refactor: reparentable_timelines query out	2024-07-26 14:39:31 +00:00
Joonas Koivunen	ada9a46dca	remove done fixme, minor reformatting	2024-07-26 14:39:31 +00:00
Joonas Koivunen	742fcac7b9	refactor: use partialeq more	2024-07-26 14:39:31 +00:00
Joonas Koivunen	55aeeb5765	allow deleting timeline unblock gc	2024-07-26 14:39:31 +00:00
Joonas Koivunen	89426570d3	relax overly strict comparisons	2024-07-26 14:39:31 +00:00
Joonas Koivunen	7f767ca18e	fix: must_restart condition	2024-07-26 14:39:31 +00:00
Joonas Koivunen	1348dbf0f1	doc: comment cleanup	2024-07-26 14:39:31 +00:00
Joonas Koivunen	a179283f86	always notify gc_waiting when writing over the witness tracking	2024-07-26 14:39:31 +00:00
Joonas Koivunen	deb86c1ea1	remodel the return type	2024-07-26 14:39:31 +00:00
Joonas Koivunen	dfdf40916f	rename complete_detaching_from_ancestor it hasn't meant completing in a while now :)	2024-07-26 14:39:31 +00:00
Joonas Koivunen	c6d8015fe9	chore: clippy needless into_iter	2024-07-26 14:39:31 +00:00
Joonas Koivunen	b2233d557b	test: complicate to include added paths	2024-07-26 14:39:31 +00:00
Joonas Koivunen	ce2552ba67	minor comment update for FIXME about 503	2024-07-26 14:39:31 +00:00
Joonas Koivunen	f4d773bb89	refactor: unify t::s::Semaphore	2024-07-26 14:39:31 +00:00
Joonas Koivunen	6f28263428	refactor: failpoint all but one	2024-07-26 14:39:31 +00:00
Joonas Koivunen	1e380ea5af	refactor: Ancestor::Delete is not needed	2024-07-26 14:39:31 +00:00
Joonas Koivunen	8258385301	remove indentation level with exhaustive match	2024-07-26 14:39:31 +00:00
Joonas Koivunen	6a8f00dea0	fix: return reparented_direct_children in case we reparent nothing new	2024-07-26 14:39:31 +00:00
Joonas Koivunen	44cdb9fb58	refactor: reparented_direct_children query	2024-07-26 14:39:31 +00:00
Joonas Koivunen	cdfaf0700f	fix: bifurcate the detach+reparent step	2024-07-26 14:39:31 +00:00
Joonas Koivunen	881e1ad056	refactor: no need to collect reparentable here	2024-07-26 14:39:31 +00:00
Joonas Koivunen	bb3d70e24d	fix: properly cancel if any reparenting failed	2024-07-26 14:39:31 +00:00
Joonas Koivunen	c6c560e4c8	rewrite to include testing assertion	2024-07-26 14:39:31 +00:00
Joonas Koivunen	8dd332aed5	doc: remove unnecessary comment	2024-07-26 14:39:31 +00:00
Joonas Koivunen	5c03a17eb8	wip: some progress now we hit the todo! in "already detached" path.	2024-07-26 14:39:31 +00:00
Joonas Koivunen	402d66778e	make reparenting operations idempotent	2024-07-26 14:39:31 +00:00
Joonas Koivunen	39e2bc932f	prepare to reparent while gc blocked	2024-07-26 14:39:31 +00:00
Joonas Koivunen	5fc034fa7f	feat: block gc persistently until detach ancestor completes	2024-07-26 14:39:31 +00:00
Joonas Koivunen	f9b12def0b	add support for WaitToActivate errors	2024-07-26 14:39:31 +00:00
Joonas Koivunen	5d0071447c	partial: index_part.json support for ongoing_detach_ancestor	2024-07-26 14:39:31 +00:00
Joonas Koivunen	d9eba3f8c3	==== PR cut here?	2024-07-26 14:39:31 +00:00
Joonas Koivunen	409e2eff9e	fix: run upload_rewritten_layer in a span there was a weird failure observed with CI tests: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8430/10108870590/index.html#suites/a1c2be32556270764423c495fad75d47/94a4686382b96297	2024-07-26 14:39:31 +00:00
Joonas Koivunen	e6e3b9a716	doc: remove on_gc_task_start fixme	2024-07-26 08:52:55 +00:00
Joonas Koivunen	7f31a3f671	forgotten rename, maybe	2024-07-26 08:52:55 +00:00
Joonas Koivunen	9971ae3d24	rename is_detached_from_{original_,}ancestor (just the rename)	2024-07-26 08:52:55 +00:00
Joonas Koivunen	48a2a20de3	chore: derive default	2024-07-26 08:52:55 +00:00
Joonas Koivunen	29ef8f15ce	chore: unused variable	2024-07-26 08:52:55 +00:00
Joonas Koivunen	5e45dd3f86	rename SharedState::notify to continue_existing_attempt	2024-07-26 08:52:55 +00:00
Joonas Koivunen	5fced442d7	warning caused by removed body	2024-07-26 08:52:55 +00:00
Joonas Koivunen	4222610233	cleanup index part dependent	2024-07-26 08:52:55 +00:00
Joonas Koivunen	92deb0dfd7	plumbing: collect timelines index parts	2024-07-26 08:52:55 +00:00
Joonas Koivunen	46ca6f17c5	plumbing: notify shared state of existing attempt	2024-07-26 08:52:55 +00:00
Joonas Koivunen	14869abb77	complete the plumbing with non-notifying attempt_blocks_gc impl	2024-07-26 08:52:55 +00:00
Joonas Koivunen	5330fd9366	doc(fixme): shared state	2024-07-26 08:52:55 +00:00
Joonas Koivunen	6c5b3b7812	doc: more sketched api comments	2024-07-26 08:52:55 +00:00
Joonas Koivunen	849fe0f191	plumb the shared state through the api for the gc pausing is quite awkward.	2024-07-26 08:52:55 +00:00
Joonas Koivunen	f564b66f21	shared state sketch	2024-07-26 08:52:55 +00:00
Joonas Koivunen	2e58ccee78	temp: planning	2024-07-26 08:52:55 +00:00
Joonas Koivunen	f398ab0264	completion: Debug and query for barrier connection	2024-07-26 08:52:55 +00:00
Joonas Koivunen	f23ee2ccdb	doc(test): be more accurate	2024-07-26 08:52:55 +00:00
Joonas Koivunen	0ad31bb7fb	doc: remove obsolete FIXME this was cleared with partial metadata updates.	2024-07-26 08:52:55 +00:00
Joonas Koivunen	86f26d0918	chore: minor rename FIXME in IndexPart	2024-07-26 08:52:55 +00:00
Joonas Koivunen	4a562dff2e	doc: more	2024-07-26 08:52:55 +00:00
Joonas Koivunen	f9185b42a9	doc: minor enhancements	2024-07-26 08:52:55 +00:00
Joonas Koivunen	d4f30daa81	chore: minor indentation problem	2024-07-26 08:52:55 +00:00
Joonas Koivunen	97ab53e826	chore: add std::fmt::Debug for Barrier	2024-07-26 08:52:55 +00:00