dnm: test

2026-05-25 09:00:37 +00:00 · 2024-07-31 17:24:08 +03:00
206 changed files with 3904 additions and 11256 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -8,9 +8,7 @@ self-hosted-runner:
    - small-arm64
    - us-east-2
 config-variables:
-  - BENCHMARK_PROJECT_ID_PUB
-  - BENCHMARK_PROJECT_ID_SUB
  - REMOTE_STORAGE_AZURE_CONTAINER
+  - REMOTE_STORAGE_AZURE_CONTAINER_NEW
  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
-  - DEV_AWS_OIDC_ROLE_ARN
--- a/.github/actions/set-docker-config-dir/action.yml
+++ b/.github/actions/set-docker-config-dir/action.yml
@@ -1,36 +0,0 @@
-name: "Set custom docker config directory"
-description: "Create a directory for docker config and set DOCKER_CONFIG"
-
-# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-runs:
-  using: "composite"
-  steps:
-  - name: Show warning on GitHub-hosted runners
-    if: runner.environment == 'github-hosted'
-    shell: bash -euo pipefail {0}
-    run: |
-      # Using the following environment variables to find a path to the workflow file
-      # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
-      # ${GITHUB_REPOSITORY}   - octocat/hello-world
-      # ${GITHUB_REF}          - refs/heads/my_branch
-      # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
-
-      filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
-      filename=${filename_with_ref%"@$GITHUB_REF"}
-
-      # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
-      title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
-      message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
-      echo "::warning file=${filename},title=${title}::${message}"
-
-  - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
-    env:
-      DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
-    with:
-      main: |
-        mkdir -p "${DOCKER_CONFIG}"
-        echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
-      post: |
-        if [ -d "${DOCKER_CONFIG}" ]; then
-          rm -r "${DOCKER_CONFIG}"
-        fi
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -1,152 +0,0 @@
-name: Prepare benchmarking databases by restoring dumps
-
-on:
-  workflow_call:
-    # no inputs needed
-    
-defaults:
-  run:
-    shell: bash -euxo pipefail {0}
-
-jobs:
-  setup-databases:
-    strategy:
-      fail-fast: false
-      matrix:
-        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] 
-        database: [ clickbench, tpch, userexample ]
-  
-    env:
-      LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
-      PLATFORM: ${{ matrix.platform }}
-      PG_BINARIES: /tmp/neon/pg_install/v16/bin
-
-    runs-on: [ self-hosted, us-east-2, x64 ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
-      options: --init
-
-    steps:
-    - name: Set up Connection String
-      id: set-up-prep-connstr
-      run: |
-        case "${PLATFORM}" in
-          neon)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} 
-            ;;
-          aws-rds-postgres)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} 
-            ;;
-          aws-aurora-serverless-v2-postgres)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} 
-            ;;
-          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}"
-            exit 1
-            ;;
-        esac
-
-        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT  
-
-    - name: Download Neon artifact
-      uses: ./.github/actions/download
-      with:
-        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-        path: /tmp/neon/
-        prefix: latest
-
-    # we create a table that has one row for each database that we want to restore with the status whether the restore is done    
-    - name: Create benchmark_restore_status table if it does not exist
-      env:
-        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
-        DATABASE_NAME: ${{ matrix.database }}
-      # to avoid a race condition of multiple jobs trying to create the table at the same time, 
-      # we use an advisory lock
-      run: |
-        ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
-        SELECT pg_advisory_lock(4711);  
-        CREATE TABLE IF NOT EXISTS benchmark_restore_status (
-        databasename text primary key,
-        restore_done boolean
-        );
-        SELECT pg_advisory_unlock(4711);
-        "
-    
-    - name: Check if restore is already done
-      id: check-restore-done
-      env:
-        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
-        DATABASE_NAME: ${{ matrix.database }}
-      run: |
-        skip=false
-        if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then
-          echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database."
-          skip=true
-        fi
-        echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
-
-    - name: Check and create database if it does not exist
-      if: steps.check-restore-done.outputs.skip != 'true'
-      env:
-        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
-        DATABASE_NAME: ${{ matrix.database }}
-      run: |
-        DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'")
-        if [ "$DB_EXISTS" != "1" ]; then
-          echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..."
-          ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";"
-        else
-          echo "Database ${{ env.DATABASE_NAME }} already exists."
-        fi
-
-    - name: Download dump from S3 to /tmp/dumps
-      if: steps.check-restore-done.outputs.skip != 'true'
-      env:
-        DATABASE_NAME: ${{ matrix.database }}
-      run: |
-        mkdir -p /tmp/dumps
-        aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ 
-
-    - name: Replace database name in connection string
-      if: steps.check-restore-done.outputs.skip != 'true'
-      id: replace-dbname
-      env:
-        DATABASE_NAME: ${{ matrix.database }}
-        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
-      run: |
-        # Extract the part before the database name
-        base_connstr="${BENCHMARK_CONNSTR%/*}"
-        # Extract the query parameters (if any) after the database name
-        query_params="${BENCHMARK_CONNSTR#*\?}"
-        # Reconstruct the new connection string
-        if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then
-          new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}"
-        else
-          new_connstr="${base_connstr}/${DATABASE_NAME}"
-        fi
-        echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT  
-
-    - name: Restore dump
-      if: steps.check-restore-done.outputs.skip != 'true'
-      env:
-        DATABASE_NAME: ${{ matrix.database }}
-        DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
-        # the following works only with larger computes: 
-        # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
-        # we add the || true because:
-        # the dumps were created with Neon and contain neon extensions that are not 
-        # available in RDS, so we will always report an error, but we can ignore it
-      run: |
-        ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
-        -d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true
-
-    - name: Update benchmark_restore_status table
-      if: steps.check-restore-done.outputs.skip != 'true'
-      env:
-        BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
-        DATABASE_NAME: ${{ matrix.database }}
-      run: |
-        ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
-        INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true)
-        ON CONFLICT (databasename) DO UPDATE SET restore_done = true;
-        "
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -223,9 +223,9 @@ jobs:
          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
-          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV_NEW }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV_NEW }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER_NEW }}"
          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'

--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -56,10 +56,6 @@ concurrency:
 jobs:
  bench:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
-    permissions:
-      contents: write
-      statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
    strategy:
      fail-fast: false
      matrix:
@@ -67,13 +63,9 @@ jobs:
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "neon-staging"
            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
-            RUNNER: [ self-hosted, us-east-2, x64 ]
-            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "azure-staging"
            region_id: 'azure-eastus2'
-            RUNNER: [ self-hosted, eastus2, x64 ]
-            IMAGE: neondatabase/build-tools:pinned
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -84,21 +76,14 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.PLATFORM }}

-    runs-on: ${{ matrix.RUNNER }}
+    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: ${{ matrix.IMAGE }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
    - uses: actions/checkout@v4

-    - name: Configure AWS credentials # necessary on Azure runners
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}  
-        role-duration-seconds: 18000 # 5 hours
-
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
@@ -162,7 +147,7 @@ jobs:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -176,7 +161,6 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
@@ -184,7 +168,7 @@ jobs:
        path: /tmp/neon/
        prefix: latest

-    - name: Run Logical Replication benchmarks
+    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
@@ -192,15 +176,12 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 5400
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
-        BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
-        BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}

-    - name: Run Physical Replication benchmarks
+    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
@@ -253,9 +234,6 @@ jobs:
      id: pgbench-compare-matrix
      run: |
        region_id_default=${{ env.DEFAULT_REGION_ID }}
-        runner_default='["self-hosted", "us-east-2", "x64"]'
-        runner_azure='["self-hosted", "eastus2", "x64"]'
-        image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
        matrix='{
          "pg_version" : [
            16
@@ -269,20 +247,16 @@ jobs:
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "runner": ['"$runner_default"'],
-          "image": [ "'"$image_default"'" ],
-          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
+          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'

-        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
-                                                     { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
+        if [ "$(date +%A)" = "Saturday" ]; then
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -322,17 +296,9 @@ jobs:

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

-  prepare_AWS_RDS_databases:
-    uses: ./.github/workflows/_benchmarking_preparation.yml
-    secrets: inherit
-  
  pgbench-compare:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
-    needs: [ generate-matrices, prepare_AWS_RDS_databases ]
-    permissions:
-      contents: write
-      statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
+    needs: [ generate-matrices ]

    strategy:
      fail-fast: false
@@ -348,9 +314,9 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.platform }}

-    runs-on: ${{ matrix.runner }}
+    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: ${{ matrix.image }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    # Increase timeout to 8h, default timeout is 6h
@@ -359,13 +325,6 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    - name: Configure AWS credentials # necessary on Azure runners
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-        role-duration-seconds: 18000 # 5 hours
-        
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
@@ -473,20 +432,12 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  pgbench-pgvector:
-    permissions:
-      contents: write
-      statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
    strategy:
      fail-fast: false
      matrix:
        include:
          - PLATFORM: "neonvm-captest-pgvector"
-            RUNNER: [ self-hosted, us-east-2, x64 ]
-            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
          - PLATFORM: "azure-captest-pgvector"
-            RUNNER: [ self-hosted, eastus2, x64 ]
-            IMAGE: neondatabase/build-tools:pinned

    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
@@ -499,9 +450,9 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.PLATFORM }}

-    runs-on: ${{ matrix.RUNNER }}
+    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: ${{ matrix.IMAGE }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
@@ -512,12 +463,12 @@ jobs:
    - name: Install postgresql-16 where pytest expects it
      run: |
        cd /home/nonroot
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb 
-        dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb 
+        dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
+        dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
+        dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
        mkdir -p /tmp/neon/pg_install/v16/bin
        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench  
        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql  
@@ -542,13 +493,6 @@ jobs:
        esac

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        
-    - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-        role-duration-seconds: 18000 # 5 hours

    - name: Benchmark pgvector hnsw indexing
      uses: ./.github/actions/run-python-test-set
@@ -577,7 +521,7 @@ jobs:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-    
+
    - name: Create Allure report
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
@@ -600,7 +544,7 @@ jobs:
    # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
    # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
-    needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
+    needs: [ generate-matrices, pgbench-compare ]

    strategy:
      fail-fast: false
@@ -608,7 +552,7 @@ jobs:

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
      TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
@@ -660,7 +604,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -690,7 +633,7 @@ jobs:
    #
    # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
-    needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]
+    needs: [ generate-matrices, clickbench-compare ]

    strategy:
      fail-fast: false
@@ -698,7 +641,7 @@ jobs:

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -730,7 +673,7 @@ jobs:
            ENV_PLATFORM=RDS_AURORA_TPCH
            ;;
          rds-postgres)
-            ENV_PLATFORM=RDS_POSTGRES_TPCH
+            ENV_PLATFORM=RDS_AURORA_TPCH
            ;;
          *)
            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -756,7 +699,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_tpch
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -778,7 +720,7 @@ jobs:

  user-examples-compare:
    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
-    needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]
+    needs: [ generate-matrices, tpch-compare ]

    strategy:
      fail-fast: false
@@ -786,7 +728,7 @@ jobs:

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -56,7 +56,13 @@ jobs:

      - uses: actions/checkout@v4

-      - uses: ./.github/actions/set-docker-config-dir
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p /tmp/.docker-custom
+          echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
+
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false
@@ -83,6 +89,11 @@ jobs:
          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
          tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}

+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf /tmp/.docker-custom
+
  merge-images:
    needs: [ build-image ]
    runs-on: ubuntu-22.04
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -309,7 +309,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  create-test-report:
-    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
+    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -484,7 +484,12 @@ jobs:
          submodules: true
          fetch-depth: 0

-      - uses: ./.github/actions/set-docker-config-dir
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false
@@ -516,6 +521,11 @@ jobs:
          tags: |
            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
+
  neon-image:
    needs: [ neon-image-arch, tag ]
    runs-on: ubuntu-22.04
@@ -560,7 +570,12 @@ jobs:
          submodules: true
          fetch-depth: 0

-      - uses: ./.github/actions/set-docker-config-dir
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false
@@ -643,6 +658,11 @@ jobs:
          tags: |
            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
+
  compute-node-image:
    needs: [ compute-node-image-arch, tag ]
    runs-on: ubuntu-22.04
@@ -715,7 +735,13 @@ jobs:
          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
          chmod +x vm-builder

-      - uses: ./.github/actions/set-docker-config-dir
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -738,6 +764,11 @@ jobs:
        run: |
          docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}

+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
+
  test-images:
    needs: [ check-permissions, tag, neon-image, compute-node-image ]
    strategy:
@@ -753,7 +784,13 @@ jobs:
        with:
          fetch-depth: 0

-      - uses: ./.github/actions/set-docker-config-dir
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -793,6 +830,11 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml logs || 0
          docker compose -f ./docker-compose/docker-compose.yml down

+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
+
  promote-images:
    permissions:
      contents: read  # This is required for actions/checkout
@@ -829,7 +871,7 @@ jobs:
        with:
          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      - name: Login to ACR
        if: github.ref_name == 'main'
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -1,35 +0,0 @@
-name: Add `external` label to issues and PRs created by external users
-
-on:
-  issues:
-    types:
-      - opened
-  pull_request:
-    types:
-      - opened
-
-# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
-permissions: {}
-
-env:
-  LABEL: external
-
-jobs:
-  add-label:
-    # This workflow uses `author_association` for PRs and issues to determine if the user is an external user.
-    # Possible values for `author_association`: https://docs.github.com/en/graphql/reference/enums#commentauthorassociation
-    if: ${{ !contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].author_association) }}
-
-    runs-on: ubuntu-22.04
-    permissions:
-      pull-requests: write
-      issues: write
-
-    steps:
-    - name: Label new ${{ github.event_name }}
-      env:
-        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].number }}
-        GH_CLI_COMMAND: ${{ github.event_name == 'pull_request' && 'pr' || 'issue' }}
-      run: |
-        gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -149,6 +149,8 @@ jobs:

    env:
      BUILD_TYPE: release
+      # remove the cachepot wrapper and build without crate caches
+      RUSTC_WRAPPER: ""
      # build with incremental compilation produce partial results
      # so do not attempt to cache this build, also disable the incremental compilation
      CARGO_INCREMENTAL: 0
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -66,31 +66,7 @@ jobs:
        ports:
          - 9000:9000
          - 8123:8123
-      zookeeper:
-        image: quay.io/debezium/zookeeper:2.7
-        ports:
-          - 2181:2181
-      kafka:
-        image: quay.io/debezium/kafka:2.7
-        env:
-          ZOOKEEPER_CONNECT: "zookeeper:2181"
-          KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
-          KAFKA_BROKER_ID: 1
-          KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
-          KAFKA_JMX_PORT: 9991
-        ports:
-          - 9092:9092
-      debezium:
-        image: quay.io/debezium/connect:2.7
-        env:
-          BOOTSTRAP_SERVERS: kafka:9092
-          GROUP_ID: 1
-          CONFIG_STORAGE_TOPIC: debezium-config
-          OFFSET_STORAGE_TOPIC: debezium-offset
-          STATUS_STORAGE_TOPIC: debezium-status
-          DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
-        ports:
-          - 8083:8083
+
    steps:
      - uses: actions/checkout@v4

--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -7,20 +7,12 @@ on:
        description: 'Source tag'
        required: true
        type: string
-      force:
-        description: 'Force the image to be pinned'
-        default: false
-        type: boolean
  workflow_call:
    inputs:
      from-tag:
        description: 'Source tag'
        required: true
        type: string
-      force:
-        description: 'Force the image to be pinned'
-        default: false
-        type: boolean

 defaults:
  run:
@@ -30,18 +22,15 @@ concurrency:
  group: pin-build-tools-image-${{ inputs.from-tag }}
  cancel-in-progress: false

-# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}

-env:
-  FROM_TAG: ${{ inputs.from-tag }}
-  TO_TAG: pinned
-
 jobs:
-  check-manifests:
+  tag-image:
    runs-on: ubuntu-22.04
-    outputs:
-      skip: ${{ steps.check-manifests.outputs.skip }}
+
+    env:
+      FROM_TAG: ${{ inputs.from-tag }}
+      TO_TAG: pinned

    steps:
      - name: Check if we really need to pin the image
@@ -58,44 +47,27 @@ jobs:

          echo "skip=${skip}" | tee -a $GITHUB_OUTPUT

-  tag-image:
-    needs: check-manifests
-
-    # use format(..) to catch both inputs.force = true AND inputs.force = 'true'
-    if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true'
-
-    runs-on: ubuntu-22.04
-
-    permissions:
-      id-token: write # for `azure/login`
-
-    steps:
      - uses: docker/login-action@v3
-
+        if: steps.check-manifests.outputs.skip == 'false'
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

+      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
+        if: steps.check-manifests.outputs.skip == 'false'
+        run: |
+          docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
+                                             neondatabase/build-tools:${FROM_TAG}
+
      - uses: docker/login-action@v3
+        if: steps.check-manifests.outputs.skip == 'false'
        with:
          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

-      - name: Azure login
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
-      - name: Login to ACR
-        run: |
-          az acr login --name=neoneastus2
-
-      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
+        if: steps.check-manifests.outputs.skip == 'false'
        run: |
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
-                                          -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
-                                          -t neondatabase/build-tools:${TO_TAG} \
                                             neondatabase/build-tools:${FROM_TAG}
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -13,6 +13,8 @@ defaults:
 env:
  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

 jobs:
  cancel-previous-e2e-tests:
@@ -62,35 +64,19 @@ jobs:
    needs: [ tag ]
    runs-on: ubuntu-22.04
    env:
-      EVENT_ACTION: ${{ github.event.action }}
-      GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      TAG: ${{ needs.tag.outputs.build-tag }}
    steps:
-      - name: Wait for `promote-images` job to finish
-        # It's important to have a timeout here, the script in the step can run infinitely
-        timeout-minutes: 60
+      - name: Check if ECR images are present
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        run: |
-          if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then
-            exit 0
-          fi
-
-          # For PRs we use the run id as the tag
-          BUILD_AND_TEST_RUN_ID=${TAG}
-          while true; do
-            conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
-            case "$conclusion" in
-              success)
-                break
-                ;;
-              failure | cancelled | skipped)
-                echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
-                exit 1
-                ;;
-              *)
-                echo "The 'promote-images' hasn't succeed yet. Waiting..."
-                sleep 60
-                ;;
-            esac
+          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
+            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
+            if [ "$OUTPUT" == "" ]; then
+              echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
+              exit 1
+            fi
          done

      - name: Set e2e-platforms
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1418,7 +1418,7 @@ dependencies = [
 "clap",
 "criterion-plot",
 "is-terminal",
- "itertools 0.10.5",
+ "itertools",
 "num-traits",
 "once_cell",
 "oorandom",
@@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
 dependencies = [
 "cast",
- "itertools 0.10.5",
+ "itertools",
 ]

 [[package]]
@@ -2134,12 +2134,6 @@ dependencies = [
 "slab",
 ]

-[[package]]
-name = "gen_ops"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
-
 [[package]]
 name = "generic-array"
 version = "0.14.7"
@@ -2716,6 +2710,17 @@ version = "3.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"

+[[package]]
+name = "io-lifetimes"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "io-uring"
 version = "0.6.2"
@@ -2734,13 +2739,14 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"

 [[package]]
 name = "is-terminal"
-version = "0.4.12"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
+checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
 "hermit-abi",
- "libc",
- "windows-sys 0.52.0",
+ "io-lifetimes",
+ "rustix 0.37.25",
+ "windows-sys 0.48.0",
 ]

 [[package]]
@@ -2752,15 +2758,6 @@ dependencies = [
 "either",
 ]

-[[package]]
-name = "itertools"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
-dependencies = [
- "either",
-]
-
 [[package]]
 name = "itoa"
 version = "1.0.6"
@@ -2875,6 +2872,18 @@ version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"

+[[package]]
+name = "linux-raw-sys"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.13"
@@ -2992,7 +3001,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
 dependencies = [
 "libc",
 "measured",
- "procfs",
+ "procfs 0.16.0",
 ]

 [[package]]
@@ -3037,7 +3046,7 @@ dependencies = [
 "measured",
 "measured-process",
 "once_cell",
- "procfs",
+ "procfs 0.14.2",
 "prometheus",
 "rand 0.8.5",
 "rand_distr",
@@ -3566,7 +3575,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "leaky-bucket",
 "md5",
 "metrics",
@@ -3584,9 +3593,8 @@ dependencies = [
 "postgres_connection",
 "postgres_ffi",
 "pq_proto",
- "procfs",
+ "procfs 0.14.2",
 "rand 0.8.5",
- "range-set-blaze",
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
@@ -3637,7 +3645,7 @@ dependencies = [
 "hex",
 "humantime",
 "humantime-serde",
- "itertools 0.10.5",
+ "itertools",
 "postgres_ffi",
 "rand 0.8.5",
 "serde",
@@ -3695,7 +3703,7 @@ dependencies = [
 "hex-literal",
 "humantime",
 "humantime-serde",
- "itertools 0.10.5",
+ "itertools",
 "metrics",
 "once_cell",
 "pageserver_api",
@@ -3960,7 +3968,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3973,7 +3981,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3992,7 +4000,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4027,7 +4035,7 @@ name = "postgres_connection"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "itertools 0.10.5",
+ "itertools",
 "once_cell",
 "postgres",
 "tokio-postgres",
@@ -4085,7 +4093,7 @@ version = "0.1.0"
 dependencies = [
 "byteorder",
 "bytes",
- "itertools 0.10.5",
+ "itertools",
 "pin-project-lite",
 "postgres-protocol",
 "rand 0.8.5",
@@ -4131,6 +4139,21 @@ dependencies = [
 "unicode-ident",
 ]

+[[package]]
+name = "procfs"
+version = "0.14.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
+dependencies = [
+ "bitflags 1.3.2",
+ "byteorder",
+ "chrono",
+ "flate2",
+ "hex",
+ "lazy_static",
+ "rustix 0.36.16",
+]
+
 [[package]]
 name = "procfs"
 version = "0.16.0"
@@ -4138,12 +4161,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
 dependencies = [
 "bitflags 2.4.1",
- "chrono",
- "flate2",
 "hex",
 "lazy_static",
 "procfs-core",
- "rustix",
+ "rustix 0.38.28",
 ]

 [[package]]
@@ -4153,15 +4174,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
 dependencies = [
 "bitflags 2.4.1",
- "chrono",
 "hex",
 ]

 [[package]]
 name = "prometheus"
-version = "0.13.4"
+version = "0.13.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
+checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
 dependencies = [
 "cfg-if",
 "fnv",
@@ -4169,7 +4189,7 @@ dependencies = [
 "libc",
 "memchr",
 "parking_lot 0.12.1",
- "procfs",
+ "procfs 0.14.2",
 "thiserror",
 ]

@@ -4191,7 +4211,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
 dependencies = [
 "bytes",
 "heck 0.4.1",
- "itertools 0.10.5",
+ "itertools",
 "lazy_static",
 "log",
 "multimap",
@@ -4212,7 +4232,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
 dependencies = [
 "anyhow",
- "itertools 0.10.5",
+ "itertools",
 "proc-macro2",
 "quote",
 "syn 1.0.109",
@@ -4269,7 +4289,7 @@ dependencies = [
 "hyper-util",
 "indexmap 2.0.1",
 "ipnet",
- "itertools 0.10.5",
+ "itertools",
 "lasso",
 "md5",
 "measured",
@@ -4324,7 +4344,6 @@ dependencies = [
 "tracing-opentelemetry",
 "tracing-subscriber",
 "tracing-utils",
- "try-lock",
 "typed-json",
 "url",
 "urlencoding",
@@ -4446,18 +4465,6 @@ dependencies = [
 "rand_core 0.5.1",
 ]

-[[package]]
-name = "range-set-blaze"
-version = "0.1.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
-dependencies = [
- "gen_ops",
- "itertools 0.12.1",
- "num-integer",
- "num-traits",
-]
-
 [[package]]
 name = "rayon"
 version = "1.7.0"
@@ -4626,7 +4633,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "metrics",
 "once_cell",
 "pin-project-lite",
@@ -4936,6 +4943,34 @@ dependencies = [
 "nom",
 ]

+[[package]]
+name = "rustix"
+version = "0.36.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.1.4",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "rustix"
+version = "0.37.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.3.8",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "rustix"
 version = "0.38.28"
@@ -5695,7 +5730,7 @@ dependencies = [
 "hex",
 "humantime",
 "hyper 0.14.26",
- "itertools 0.10.5",
+ "itertools",
 "lasso",
 "measured",
 "metrics",
@@ -5704,7 +5739,6 @@ dependencies = [
 "pageserver_client",
 "postgres_connection",
 "r2d2",
- "rand 0.8.5",
 "reqwest 0.12.4",
 "routerify",
 "scopeguard",
@@ -5760,10 +5794,9 @@ dependencies = [
 "either",
 "futures",
 "futures-util",
- "git-version",
 "hex",
 "humantime",
- "itertools 0.10.5",
+ "itertools",
 "once_cell",
 "pageserver",
 "pageserver_api",
@@ -5940,15 +5973,15 @@ dependencies = [

 [[package]]
 name = "tempfile"
-version = "3.9.0"
+version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
+checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
 dependencies = [
 "cfg-if",
- "fastrand 2.0.0",
- "redox_syscall 0.4.1",
- "rustix",
- "windows-sys 0.52.0",
+ "fastrand 1.9.0",
+ "redox_syscall 0.3.5",
+ "rustix 0.37.25",
+ "windows-sys 0.45.0",
 ]

 [[package]]
@@ -6187,7 +6220,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6564,9 +6597,9 @@ dependencies = [

 [[package]]
 name = "try-lock"
-version = "0.2.5"
+version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"

 [[package]]
 name = "tungstenite"
@@ -7145,6 +7178,15 @@ dependencies = [
 "windows_x86_64_msvc 0.42.2",
 ]

+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets 0.42.2",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.48.0"
@@ -7163,6 +7205,21 @@ dependencies = [
 "windows-targets 0.52.4",
 ]

+[[package]]
+name = "windows-targets"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
 [[package]]
 name = "windows-targets"
 version = "0.48.0"
@@ -7392,7 +7449,7 @@ dependencies = [
 "hmac",
 "hyper 0.14.26",
 "indexmap 1.9.3",
- "itertools 0.10.5",
+ "itertools",
 "libc",
 "log",
 "memchr",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
 parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
-procfs = "0.16"
+procfs = "0.14"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
@@ -184,7 +184,6 @@ tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.21.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
-try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }
 typed-json = "0.1"
 url = "2.2"
--- a/21
+++ b/21
@@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

-ENV BUILD_TYPE=release
+ENV BUILD_TYPE release
 RUN set -e \
    && mold -run make -j $(nproc) -s neon-pg-ext \
    && rm -rf pg_install/build \
@@ -29,12 +29,24 @@ WORKDIR /home/nonroot
 ARG GIT_VERSION=local
 ARG BUILD_TAG

+# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
+# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
+# cachepot falls back to the local filesystem if S3 is misconfigured, rather than failing the build.
+ARG RUSTC_WRAPPER=cachepot
+ENV AWS_REGION=eu-central-1
+ENV CACHEPOT_S3_KEY_PREFIX=cachepot
+ARG CACHEPOT_BUCKET=neon-github-dev
+#ARG AWS_ACCESS_KEY_ID
+#ARG AWS_SECRET_ACCESS_KEY
+
 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/lib                       pg_install/v16/lib
 COPY --chown=nonroot . .

+# Show build caching stats to check whether the cache was actually used.
+# This has to be part of the same RUN, since the cachepot daemon is killed at the end of this RUN and its compilation stats are lost.
 RUN set -e \
    && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
      --bin pg_sni_router  \
@@ -46,7 +58,8 @@ RUN set -e \
      --bin proxy  \
      --bin neon_local \
      --bin storage_scrubber \
-      --locked --release
+      --locked --release \
+    && cachepot -s

 # Build final image
 #
@@ -91,7 +104,7 @@ RUN mkdir -p /data/.neon/ && \

 # When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
 # that want a particular postgres version will select it explicitly: this is just a default.
-ENV LD_LIBRARY_PATH=/usr/local/v16/lib
+ENV LD_LIBRARY_PATH /usr/local/v16/lib


 VOLUME ["/data"]
@@ -99,5 +112,5 @@ USER neon
 EXPOSE 6400
 EXPOSE 9898

-CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"]
+CMD /usr/local/bin/pageserver -D /data/.neon

--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -58,7 +58,7 @@ RUN set -e \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

 # protobuf-compiler (protoc)
-ENV PROTOC_VERSION=25.1
+ENV PROTOC_VERSION 25.1
 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
    && unzip -q protoc.zip -d protoc \
    && mv protoc/bin/protoc /usr/local/bin/protoc \
@@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
    && rm awscliv2.zip

 # Mold: A Modern Linker
-ENV MOLD_VERSION=v2.33.0
+ENV MOLD_VERSION v2.31.0
 RUN set -e \
    && git clone https://github.com/rui314/mold.git \
    && mkdir mold/build \
@@ -168,7 +168,7 @@ USER nonroot:nonroot
 WORKDIR /home/nonroot

 # Python
-ENV PYTHON_VERSION=3.9.19 \
+ENV PYTHON_VERSION=3.9.18 \
    PYENV_ROOT=/home/nonroot/.pyenv \
    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
 RUN set -e \
@@ -192,14 +192,9 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.80.1
+ENV RUSTC_VERSION=1.80.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
-ARG RUSTFILT_VERSION=0.2.1
-ARG CARGO_HAKARI_VERSION=0.9.30
-ARG CARGO_DENY_VERSION=0.16.1
-ARG CARGO_HACK_VERSION=0.6.31
-ARG CARGO_NEXTEST_VERSION=0.9.72
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
 	chmod +x rustup-init && \
 	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -208,13 +203,15 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    . "$HOME/.cargo/env" && \
    cargo --version && rustup --version && \
    rustup component add llvm-tools-preview rustfmt clippy && \
-    cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
-    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
-    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
-    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} && \
-    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} && \
+    cargo install --git https://github.com/paritytech/cachepot && \
+    cargo install rustfilt && \
+    cargo install cargo-hakari && \
+    cargo install cargo-deny --locked && \
+    cargo install cargo-hack && \
+    cargo install cargo-nextest && \
    rm -rf /home/nonroot/.cargo/registry && \
    rm -rf /home/nonroot/.cargo/git
+ENV RUSTC_WRAPPER=cachepot

 # Show versions
 RUN whoami \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /

-ENV PATH="/usr/local/pgsql/bin:$PATH"
+ENV PATH "/usr/local/pgsql/bin:$PATH"

 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
@@ -411,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ARG PG_VERSION
-ENV PATH="/usr/local/pgsql/bin:$PATH"
+ENV PATH "/usr/local/pgsql/bin:$PATH"

 RUN case "${PG_VERSION}" in \
      "v14" | "v15") \
@@ -444,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ARG PG_VERSION
-ENV PATH="/usr/local/pgsql/bin:$PATH"
+ENV PATH "/usr/local/pgsql/bin:$PATH"

 RUN case "${PG_VERSION}" in \
      "v14") \
@@ -480,7 +480,7 @@ RUN case "${PG_VERSION}" in \
 FROM build-deps AS pg-cron-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
@@ -506,7 +506,7 @@ RUN apt-get update && \
        libboost-system1.74-dev \
        libeigen3-dev

-ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
    echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
    mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
@@ -546,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
 FROM build-deps AS pg-uuidv7-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
@@ -563,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz
 FROM build-deps AS pg-roaringbitmap-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
@@ -580,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
 FROM build-deps AS pg-semver-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
    echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
    mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
@@ -598,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ARG PG_VERSION
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
      "v14" | "v15") \
        export PG_EMBEDDING_VERSION=0.3.5 \
@@ -622,7 +622,7 @@ RUN case "${PG_VERSION}" in \
 FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
@@ -750,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -
 FROM build-deps AS wal2json-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
    echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
    mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
@@ -766,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
 FROM build-deps AS pg-ivm-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
    echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
    mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
@@ -783,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
 FROM build-deps AS pg-partman-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
    echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
    mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
@@ -933,8 +933,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
 #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
 COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
 COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
-COPY --from=rum-pg-build /rum.tar.gz /ext-src
-COPY patches/rum.patch /ext-src
+#COPY --from=rum-pg-build /rum.tar.gz /ext-src
 #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
 COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
 COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -946,7 +945,7 @@ COPY patches/pg_hintplan.patch /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
-#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
+COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
 COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
 COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
 COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
@@ -961,7 +960,6 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
    rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
    || exit 1; rm -f $f; done
 RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
-RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
 # cmake is required for the h3 test
 RUN apt-get update && apt-get install -y cmake
 RUN patch -p1 < /ext-src/pg_hintplan.patch
@@ -1034,6 +1032,6 @@ RUN apt update &&  \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

-ENV LANG=en_US.utf8
+ENV LANG en_US.utf8
 USER postgres
 ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/README.md
+++ b/README.md
@@ -313,3 +313,5 @@ To get more familiar with this aspect, refer to:
 - Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices.
 - To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md).
 - To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html
+
+.
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -4,11 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-default = []
-# Enables test specific features.
-testing = []
-
 [dependencies]
 anyhow.workspace = true
 async-compression.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -400,15 +400,7 @@ impl ComputeNode {
    pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let mut retry_period_ms = 500.0;
        let mut attempts = 0;
-        const DEFAULT_ATTEMPTS: u16 = 10;
-        #[cfg(feature = "testing")]
-        let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
-            u16::from_str(&v).unwrap()
-        } else {
-            DEFAULT_ATTEMPTS
-        };
-        #[cfg(not(feature = "testing"))]
-        let max_attempts = DEFAULT_ATTEMPTS;
+        let max_attempts = 10;
        loop {
            let result = self.try_get_basebackup(compute_state, lsn);
            match result {
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {

 fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
    for (var, val) in std::env::vars() {
-        if var.starts_with("NEON_") {
+        if var.starts_with("NEON_PAGESERVER_") {
            cmd = cmd.env(var, val);
        }
    }
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -824,12 +824,11 @@ impl Endpoint {
        // cleanup work to do after postgres stops, like syncing safekeepers,
        // etc.
        //
-        // If destroying or stop mode is immediate, send it SIGTERM before
-        // waiting. Sometimes we do *not* want this cleanup: tests intentionally
-        // do stop when majority of safekeepers is down, so sync-safekeepers
-        // would hang otherwise. This could be a separate flag though.
-        let send_sigterm = destroy || mode == "immediate";
-        self.wait_for_compute_ctl_to_exit(send_sigterm)?;
+        // If destroying, send it SIGTERM before waiting. Sometimes we do *not*
+        // want this cleanup: tests intentionally stop the endpoint while a
+        // majority of safekeepers are down, and sync-safekeepers would hang in
+        // that case. This could be a separate flag though.
+        self.wait_for_compute_ctl_to_exit(destroy)?;
        if destroy {
            println!(
                "Destroying postgres data directory '{}'",
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -158,8 +158,6 @@ pub struct NeonStorageControllerConf {

    /// Threshold for auto-splitting a tenant into shards
    pub split_threshold: Option<u64>,
-
-    pub max_secondary_lag_bytes: Option<u64>,
 }

 impl NeonStorageControllerConf {
@@ -175,7 +173,6 @@ impl Default for NeonStorageControllerConf {
            max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
            max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
            split_threshold: None,
-            max_secondary_lag_bytes: None,
        }
    }
 }
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -383,10 +383,6 @@ impl StorageController {
            args.push(format!("--split-threshold={split_threshold}"))
        }

-        if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() {
-            args.push(format!("--max-secondary-lag-bytes={lag}"))
-        }
-
        args.push(format!(
            "--neon-local-repo-dir={}",
            self.env.base_data_dir.display()
--- a/deny.toml
+++ b/deny.toml
@@ -4,7 +4,6 @@
 # to your expectations and requirements.

 # Root options
-[graph]
 targets = [
    { triple = "x86_64-unknown-linux-gnu" },
    { triple = "aarch64-unknown-linux-gnu" },
@@ -13,7 +12,6 @@ targets = [
 ]
 all-features = false
 no-default-features = false
-[output]
 feature-depth = 1

 # This section is considered when running `cargo deny check advisories`
@@ -21,13 +19,17 @@ feature-depth = 1
 # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
 [advisories]
 db-urls = ["https://github.com/rustsec/advisory-db"]
+vulnerability = "deny"
+unmaintained = "warn"
 yanked = "warn"
+notice = "warn"
 ignore = []

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
 [licenses]
+unlicensed = "deny"
 allow = [
    "Apache-2.0",
    "Artistic-2.0",
@@ -40,6 +42,10 @@ allow = [
    "OpenSSL",
    "Unicode-DFS-2016",
 ]
+deny = []
+copyleft = "warn"
+allow-osi-fsf-free = "neither"
+default = "deny"
 confidence-threshold = 0.8
 exceptions = [
    # Zlib license has some restrictions if we decide to change sth
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
        rm -rf $TMPDIR
        # We are running tests now
-        if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
+        if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
        then
            cleanup
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,15 +1,15 @@
 #!/bin/bash
 set -x

-cd /ext-src || exit 2
+cd /ext-src
 FAILED=
-LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
+LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
 for d in ${LIST}
 do
-       [ -d "${d}" ] || continue
+       [ -d ${d} ] || continue
    psql -c "select 1" >/dev/null || break
-       USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
+       make -C ${d} installcheck || FAILED="${d} ${FAILED}"
 done
 [ -z "${FAILED}" ] && exit 0
-echo "${FAILED}"
+echo ${FAILED}
 exit 1
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -1,18 +1,13 @@
 # Summary

-# Looking for `neon.tech` docs?
-
-This page links to a selection of technical content about the open source code in this repository.
-
-Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code
-in this repository.
-
-# Architecture
-
 [Introduction]()
 - [Separation of Compute and Storage](./separation-compute-storage.md)

+# Architecture
+
 - [Compute]()
+  - [WAL proposer]()
+  - [WAL Backpressure]()
  - [Postgres changes](./core_changes.md)

 - [Pageserver](./pageserver.md)
@@ -21,15 +16,33 @@ in this repository.
    - [WAL Redo](./pageserver-walredo.md)
    - [Page cache](./pageserver-pagecache.md)
    - [Storage](./pageserver-storage.md)
+        - [Datadir mapping]()
+        - [Layer files]()
+        - [Branching]()
+        - [Garbage collection]()
+    - [Cloud Storage]()
    - [Processing a GetPage request](./pageserver-processing-getpage.md)
    - [Processing WAL](./pageserver-processing-wal.md)
+    - [Management API]()
+    - [Tenant Rebalancing]()

 - [WAL Service](walservice.md)
  - [Consensus protocol](safekeeper-protocol.md)
+  - [Management API]()
+  - [Rebalancing]()
+
+- [Control Plane]()
+
+- [Proxy]()

 - [Source view](./sourcetree.md)
  - [docker.md](./docker.md) — Docker images and building pipeline.
  - [Error handling and logging](./error-handling.md)
+  - [Testing]()
+    - [Unit testing]()
+    - [Integration testing]()
+    - [Benchmarks]()
+

 - [Glossary](./glossary.md)

@@ -45,6 +58,28 @@ in this repository.

 # RFCs

-Major changes are documented in RFCS:
- See [RFCs](./rfcs/README.md) for more information
- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs
+- [RFCs](./rfcs/README.md)
+
+- [002-storage](rfcs/002-storage.md)
+- [003-laptop-cli](rfcs/003-laptop-cli.md)
+- [004-durability](rfcs/004-durability.md)
+- [005-zenith_local](rfcs/005-zenith_local.md)
+- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
+- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
+- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
+- [008-push-pull](rfcs/008-push-pull.md)
+- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
+- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
+- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
+- [010-storage_details](rfcs/010-storage_details.md)
+- [011-retention-policy](rfcs/011-retention-policy.md)
+- [012-background-tasks](rfcs/012-background-tasks.md)
+- [013-term-history](rfcs/013-term-history.md)
+- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
+- [014-storage-lsm](rfcs/014-storage-lsm.md)
+- [015-storage-messaging](rfcs/015-storage-messaging.md)
+- [016-connection-routing](rfcs/016-connection-routing.md)
+- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
+- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
+- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
+- [cluster-size-limits](rfcs/cluster-size-limits.md)
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -1,495 +0,0 @@
-# Safekeeper dynamic membership change
-
-To quickly recover from safekeeper node failures and do rebalancing we need to
-be able to change set of safekeepers the timeline resides on. The procedure must
-be safe (not lose committed log) regardless of safekeepers and compute state. It
-should be able to progress if any majority of old safekeeper set, any majority
-of new safekeeper set and compute are up and connected. This is known as a
-consensus membership change. It always involves two phases: 1) switch old
-majority to old + new configuration, preventing commits without acknowledge from
-the new set 2) bootstrap the new set by ensuring majority of the new set has all
-data which ever could have been committed before the first phase completed;
-after that switch is safe to finish. Without two phases switch to the new set
-which quorum might not intersect with quorum of the old set (and typical case of
-ABC -> ABD switch is an example of that, because quorums AC and BD don't
-intersect). Furthermore, procedure is typically carried out by the consensus
-leader, and so enumeration of configurations which establishes order between
-them is done through consensus log.
-
-In our case consensus leader is compute (walproposer), and we don't want to wake
-up all computes for the change. Neither we want to fully reimplement the leader
-logic second time outside compute. Because of that the proposed algorithm relies
-for issuing configurations on the external fault tolerant (distributed) strongly
-consistent storage with a simple API: CAS (compare-and-swap) on a single key.
-Properly configured postgres suits this.
-
-In the system consensus is implemented at the timeline level, so algorithm below
-applies to the single timeline.
-
-## Algorithm
-
-### Definitions
-
-A configuration is
-
-```
-struct Configuration {
-    generation: Generation, // a number uniquely identifying configuration
-    sk_set: Vec<NodeId>, // current safekeeper set
-    new_sk_set: Optional<Vec<NodeId>>,
-}
-```
-
-Configuration with `new_sk_set` present is used for the intermediate step during
-the change and is called a joint configuration. Generations establish the order of
-configurations: we say `c1` is higher than `c2` if `c1.generation` >
-`c2.generation`.
-
-### Persistently stored data changes
-
-Safekeeper starts storing its current configuration in the control file. Update
-of it is atomic, so in-memory value always matches the persistent one.
-
-External CAS providing storage (let's call it configuration storage here) also
-stores configuration for each timeline. It is initialized with generation 1 and
-initial set of safekeepers during timeline creation. Executed CAS on it must
-never be lost.
-
-### Compute <-> safekeeper protocol changes
-
-`ProposerGreeting` message carries walproposer's configuration if it is already
-established (see below), else null.  `AcceptorGreeting` message carries
-safekeeper's current `Configuration`. All further messages (`VoteRequest`,
-`VoteResponse`, `ProposerElected`, `AppendRequest`, `AppendResponse`) carry
-generation number, of walproposer in case of wp->sk message or of safekeeper in
-case of sk->wp message.
-
-### Safekeeper changes
-
-Basic rule: once safekeeper observes configuration higher than his own it
-immediately switches to it. It must refuse all messages with lower generation
-than his. It also refuses messages if it is not member of the current generation
-(that is, of either `sk_set` or `new_sk_set`), though it is likely not unsafe to
-process them (walproposer should ignore result anyway).
-
-If there is non null configuration in `ProposerGreeting` and it is higher than
-current safekeeper one, safekeeper switches to it.
-
-Safekeeper sends its current configuration in its first message to walproposer
-`AcceptorGreeting`. It refuses all other walproposer messages if the
-configuration generation in them is less than its current one. Namely, it
-refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
-response it sends its current configuration generation to let walproposer know.
-
-Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` 
-accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
-current one and ignores it otherwise. In any case it replies with
-```
-struct ConfigurationSwitchResponse {
-    conf: Configuration,
-    term: Term,
-    last_log_term: Term,
-    flush_lsn: Lsn,
-}
-```
-
-### Compute (walproposer) changes
-
-Basic rule is that joint configuration requires votes from majorities in
-both `sk_set` and `new_sk_set`.
-
-Compute receives list of safekeepers to connect to from the control plane as
-currently and tries to communicate with all of them. However, the list does not
-define consensus members. Instead, on start walproposer tracks highest
-configuration it receives from `AcceptorGreeting`s. Once it assembles greetings
-from majority of `sk_set` and majority of `new_sk_set` (if it is present), it
-establishes this configuration as its own and moves to voting. 
-
-It should stop talking to safekeepers not listed in the configuration at this
-point, though it is not unsafe to continue doing so.
-
-To be elected it must receive votes from both majorities if `new_sk_set` is present.
-Similarly, to commit WAL it must receive flush acknowledge from both majorities.
-
-If walproposer hears from safekeeper configuration higher than his own (i.e.
-refusal to accept due to configuration change) it simply restarts.
-
-### Change algorithm
-
-The following algorithm can be executed anywhere having access to configuration
-storage and safekeepers. It is safe to interrupt / restart it and run multiple
-instances of it concurrently, though likely one of them won't make
-progress then. It accepts `desired_set: Vec<NodeId>` as input. 
-
-Algorithm will refuse to make the change if it encounters previous interrupted
-change attempt, but in this case it will try to finish it.
-
-It will eventually converge if old majority, new majority and configuration
-storage are reachable.
-
-1) Fetch current timeline configuration from the configuration storage.
-2) If it is already joint one and `new_sk_set` is different from `desired_set`
-   refuse to change. However, assign joint conf to (in memory) var
-   `joint_conf` and proceed to step 4 to finish the ongoing change.
-3) Else, create joint `joint_conf: Configuration`: increment current conf number
-   `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
-   storage by doing CAS on the current generation: change happens only if
-   current configuration number is still `n`. Apart from guaranteeing uniqueness
-   of configurations, CAS linearizes them, ensuring that new configuration is
-   created only following the previous one when we know that the transition is
-   safe. Failed CAS aborts the procedure.
-4) Call `PUT` `configuration` on safekeepers from the current set,
-   delivering them `joint_conf`. Collecting responses from majority is required
-   to proceed. If any response returned generation higher than 
-   `joint_conf.generation`, abort (another switch raced us). Otherwise, choose
-   max `<last_log_term, flush_lsn>` among responses and establish it as
-   (in memory) `sync_position`. Also choose max `term` and establish it as (in
-   memory) `sync_term`. We can't finish the switch until majority of the new set
-   catches up to this `sync_position` because data before it could be committed
-   without ack from the new set. Similarly, we'll bump term on new majority
-   to `sync_term` so that two computes with the same term are never elected.
-4) Initialize timeline on safekeeper(s) from `new_sk_set` where it
-   doesn't exist yet by doing `pull_timeline` from the majority of the 
-   current set. Doing that on majority of `new_sk_set` is enough to
-   proceed, but it is reasonable to ensure that all `new_sk_set` members
-   are initialized -- if some of them are down why are we migrating there?
-5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. 
-   Success on majority is enough.
-6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
-   delivering them `joint_conf` and collecting their positions. This will
-   switch them to the `joint_conf` which generally won't be needed 
-   because `pull_timeline` already includes it and plus additionally would be
-   broadcast by compute. More importantly, we may proceed to the next step
-   only when `<last_log_term, flush_lsn>` on the majority of the new set reached 
-   `sync_position`. Similarly, on the happy path no waiting is needed because
-   `pull_timeline` already includes it. However, we should double
-    check to be safe. For example, timeline could have been created earlier e.g.
-    manually or after try-to-migrate, abort, try-to-migrate-again sequence. 
-7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
-   safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration 
-   storage under one more CAS.
-8) Call `PUT` `configuration` on safekeepers from the new set,
-   delivering them `new_conf`. It is enough to deliver it to the majority 
-   of the new set; the rest can be updated by compute.
-
-I haven't put huge effort to make the description above very precise, because it
-is natural language prone to interpretations anyway. Instead I'd like to make TLA+
-spec of it.
-
-Description above focuses on safety. To make the flow practical and live, here a few more 
-considerations.
-1) It makes sense to ping new set to ensure we are migrating to live node(s) before
-  step 3.
-2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed 
-   it is safe to rollback to the old conf with one more CAS.
-3) On step 4 timeline might be already created on members of the new set for various reasons; 
-   the simplest is the procedure restart. There are more complicated scenarios like mentioned
-   in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
-   generations, so seems simpler to treat existing timeline as success. However, this also
-   has a disadvantage: you might imagine a surpassingly unlikely schedule where condition in
-   the step 5 is never reached until compute is (re)awakened to synchronize new member(s).
-   I don't think we'll observe this in practice, but can add waking up compute if needed.
-4) In the end timeline should be locally deleted on the safekeeper(s) which are
-   in the old set but not in the new one, unless they are unreachable. To be
-   safe this also should be done under generation number (deletion proceeds only if 
-   current configuration is <= the one in the request and safekeeper is not a member of it).
-5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
-   jump to step 7, using it as `new_conf`.
-
-## Implementation
-
-The procedure ought to be driven from somewhere. Obvious candidates are control
-plane and storage_controller; and as each of them already has db we don't want
-yet another storage. I propose to manage safekeepers in storage_controller
-because 1) since it is in rust it simplifies simulation testing (more on this
-below) 2) it already manages pageservers. 
-
-This assumes that migration will be fully usable only after we migrate all
-tenants/timelines to storage_controller. It is discussible whether we want also
-to manage pageserver attachments for all of these, but likely we do.
-
-This requires us to define storcon <-> cplane interface.
-
-### storage_controller <-> control plane interface
-
-First of all, control plane should
-[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
-storing safekeepers per timeline instead of per tenant because we can't migrate
-tenants atomically. 
-
-The important question is how updated configuration is delivered from
-storage_controller to control plane to provide it to computes. As always, there
-are two options, pull and push. Let's do it the same push as with pageserver
-`/notify-attach` because 1) it keeps storage_controller out of critical compute
-start path 2) provides easier upgrade: there won't be such a thing as 'timeline
-managed by control plane / storcon', cplane just takes the value out of its db
-when needed 3) uniformity. It makes storage_controller responsible for retrying notifying
-control plane until it succeeds.
-
-So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and
-updates it in the db if the provided conf generation is higher (the cplane db
-should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
-should update db which makes the call successful, and then try to schedule
-`apply_config` if possible, it is ok if not. storage_controller 
-should rate limit calling the endpoint, but likely this won't be needed, as migration
-throughput is limited by `pull_timeline`.
-
-Timeline (branch) creation in cplane should call storage_controller POST
-`tenant/:tenant_id/timeline` like it currently does for sharded tenants.
-Response should be augmented with `safekeeper_conf: Configuration`. The call
-should be retried until succeeds.
-
-Timeline deletion and tenant deletion in cplane should call appropriate
-storage_controller endpoints like it currently does for sharded tenants. The
-calls should be retried until they succeed.
-
-### storage_controller implementation
-
-Current 'load everything on startup and keep in memory' easy design is fine.
-Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16
-byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so
-10^6 of timelines shouldn't take more than 100MB.
-
-Similar to pageserver attachment Intents storage_controller would have in-memory
-`MigrationRequest` (or its absence) for each timeline and pool of tasks trying
-to make these requests reality; this ensures one instance of storage_controller
-won't do several migrations on the same timeline concurrently. In the first
-version it is simpler to have more manual control and no retries, i.e. migration
-failure removes the request. Later we can build retries and automatic
-scheduling/migration. `MigrationRequest` is
-```
-enum MigrationRequest {
-    To(Vec<NodeId>),
-    FinishPending,
-}
-```
-
-`FinishPending` requests to run the procedure to ensure state is clean: current
-configuration is not joint and majority of safekeepers are aware of it, but do
-not attempt to migrate anywhere. If current configuration fetched on step 1 is
-not joint it jumps to step 7. It should be run at startup for all timelines (but
-similarly, in the first version it is ok to trigger it manually).
-
-#### Schema
-
-`safekeepers` table mirroring current `nodes` should be added, except that for
-`scheduling_policy` field (seems like `status` is a better name for it): it is enough
-to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3)
-`decomissioned`.
-
-`timelines` table:
-```
-table! {
-    // timeline_id is primary key
-    timelines (tenant_id, timeline_id) {
-        timeline_id -> Varchar,
-        tenant_id -> Varchar,
-        generation -> Int4,
-        sk_set -> Array<Int4>, // list of safekeeper ids
-        new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf
-        cplane_notified_generation -> Int4,
-    }
-}
-```
-
-#### API
-
-Node management is similar to pageserver:
-1) POST `/control/v1/safekeepers` upserts safekeeper.
-2) GET `/control/v1/safekeepers` lists safekeepers.
-3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
-4) PUT `/control/v1/safekeepers/:node_id/status` changes status to e.g.
-   `offline` or `decomissioned`. Initially it is simpler not to schedule any
-    migrations here.
-
-Safekeeper deploy scripts should register safekeeper at storage_controller as
-they currently do with cplane, under the same id.
-
-Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
-would 1) choose initial set of safekeepers; 2) write to the db initial
-`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
-case of conflict; 3) create timeline on the majority of safekeepers (already
-created is ok).
-
-We don't want to block timeline creation when one safekeeper is down. Currently
-this is solved by compute implicitly creating timeline on any safekeeper it is
-connected to. This creates ugly timeline state on safekeeper when timeline is
-created, but start LSN is not defined yet. It would be nice to remove this; to
-do that, controller can in the background retry to create timeline on
-safekeeper(s) which missed that during initial creation call. It can do that
-through `pull_timeline` from majority so it doesn't need to remember
-`parent_lsn` in its db.
-
-Timeline deletion removes the row from the db and forwards deletion to the
-current configuration members. Without additional actions deletions might leak,
-see below on this; initially let's ignore these, reporting to cplane success if
-at least one safekeeper deleted the timeline (this will remove s3 data).
-
-Tenant deletion repeats timeline deletion for all timelines.
-
-Migration API: the first version is the simplest and the most imperative:
-1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move
-all timelines from one safekeeper to another. It accepts json
-```
-{
-    "src_sk": u32,
-    "dst_sk": u32,
-    "limit": Optional<u32>,
-}
-```
-
-Returns list of scheduled requests.
-
-2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
-   to move single timeline to given set of safekeepers:
-```
-{
-    "desired_set": Vec<u32>,
-}
-```
-
-Returns scheduled request.
-
-Similar call should be added for the tenant.
-
-It would be great to have some way of subscribing to the results (apart from
-looking at logs/metrics).
-
-Migration is executed as described above. One subtlety is that (local) deletion on
-source safekeeper might fail, which is not a problem if we are going to
-decomission the node but leaves garbage otherwise. I'd propose in the first version
-1) Don't attempt deletion at all if node status is `offline`.
-2) If it failed, just issue warning.
-And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and 
-remove garbage timelines for manual use. It will 1) list all timelines on the 
-safekeeper 2) compare each one against configuration storage: if timeline 
-doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can 
-be deleted under generation number if node is not member of current generation.
-
-Automating this is nontrivial; we'd need to register all potential missing
-deletions <tenant_id, timeline_id, generation, node_id> in the same transaction
-which switches configurations. Similarly when timeline is fully deleted to
-prevent cplane operation from blocking when some safekeeper is not available
-deletion should be also registered.
-
-One more task pool should infinitely retry notifying control plane about changed
-safekeeper sets.
-
-3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
-   current in memory state of the timeline and pending `MigrationRequest`,
-   if any.
-
-4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort` tries to abort the
-   migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS
-   (incrementing generation as always).
-
-#### Dealing with multiple instances of storage_controller
-
-Operations described above executed concurrently might create some errors but do
-not prevent progress, so while we normally don't want to run multiple instances
-of storage_controller it is fine to have it temporarily, e.g. during redeploy.
-
-Any interactions with db update in-memory controller state, e.g. if migration
-request failed because different one is in progress, controller remembers that
-and tries to finish it.
-
-## Testing
-
-`neon_local` should be switched to use storage_controller, playing role of
-control plane.
-
-There should be following layers of tests:
-1) Model checked TLA+ spec specifies the algorithm and verifies its basic safety.
-
-2) To cover real code and at the same time test many schedules we should have
-   simulation tests. For that, configuration storage, storage_controller <->
-   safekeeper communication and pull_timeline need to be mocked and main switch
-   procedure wrapped as a node (thread) in simulation tests, using these
-   mocks. Test would inject migrations like it currently injects
-   safekeeper/walproposer restarts. Main assert is the same -- committed WAL must
-   not be lost.
-
-3) Since simulation testing injects at relatively high level points (not
-   syscalls), it omits some code, in particular `pull_timeline`. Thus it is
-   better to have basic tests covering whole system as well. Extended version of
-   `test_restarts_under_load` would do: start background load and do migration 
-   under it, then restart endpoint and check that no reported commits 
-   had been lost. I'd also add one more creating classic network split scenario, with
-   one compute talking to AC and another to BD while migration from nodes ABC to ABD
-   happens.
-
-4) Simple e2e test should ensure that full flow including cplane notification works.
-
-## Order of implementation and rollout
-
-Note that 
- Control plane parts and integration with it is fully independent from everything else
-  (tests would use simulation and neon_local).
- There is a lot of infra work making storage_controller aware of timelines and safekeepers
-  and its impl/rollout should be separate from migration itself.
- Initially walproposer can just stop working while it observes joint configuration.
-  Such window would be typically very short anyway.
-
-To rollout smoothly, both walproposer and safekeeper should have flag
-`configurations_enabled`; when set to false, they would work as currently, i.e.
-walproposer is able to commit on whatever safekeeper set it is provided. Until
-all timelines are managed by storcon we'd need to use current script to migrate
-and update/drop entries in the storage_controller database if it has any.
-
-Safekeepers would need to be able to talk both current and new protocol version
-with compute to reduce number of computes restarted in prod once v2 protocol is
-deployed (though before completely switching we'd need to force this).
-
-Let's have the following rollout order:
- storage_controller becomes aware of safekeepers;
- storage_controller gets timeline creation for new timelines and deletion requests, but
-  doesn't manage all timelines yet. Migration can be tested on these new timelines.
-  To keep control plane and storage_controller databases in sync while control 
-  plane still chooses the safekeepers initially (until all timelines are imported
-  it can choose better), `TimelineCreateRequest` can get optional safekeepers
-  field with safekeepers chosen by cplane.
- Then we can import all existing timelines from control plane to
-  storage_controller and gradually enable configurations region by region.
-
-
-Very rough implementation order:
- Add concept of configurations to safekeepers (including control file),
-  implement v3 protocol.
- Implement walproposer changes, including protocol.
- Implement storcon part. Use it in neon_local (and pytest).
- Make cplane store safekeepers per timeline instead of per tenant.
- Implement cplane/storcon integration. Route branch creation/deletion 
-  through storcon. Then we can test migration of new branches.
- Finally import existing branches. Then we can drop cplane 
-  safekeeper selection code. Gradually enable configurations at 
-  computes and safekeepers. Before that, all computes must talk only
-  v3 protocol version.
-
-## Integration with evicted timelines
-
-Currently, `pull_timeline` doesn't work correctly with evicted timelines because
-copy would point to original partial file. To fix let's just do s3 copy of the
-file. It is a bit stupid as generally unnecessary work, but it makes sense to
-implement proper migration before doing smarter timeline archival. [Issue](https://github.com/neondatabase/neon/issues/8542)
-
-## Possible optimizations
-
-Steps above suggest walproposer restart (with re-election) and thus reconnection
-to safekeepers. Since by bumping term on new majority we ensure that leader
-terms are unique even across generation switches it is possible to preserve
-connections. However, it is more complicated, reconnection is very fast and it
-is much more important to avoid compute restart than millisecond order of write
-stall.
-
-Multiple joint consensus: algorithm above rejects attempt to change membership
-while another attempt is in progress. It is possible to overlay them and AFAIK
-Aurora does this but similarly I don't think this is needed.
-
-## Misc
-
-We should use Compute <-> safekeeper protocol change to include other (long
-yearned) modifications:
- send data in network order to make arm work.
- remove term_start_lsn from AppendRequest
- add horizon to TermHistory
- add to ProposerGreeting number of connection from this wp to sk
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -22,11 +22,6 @@ pub struct Key {
    pub field6: u32,
 }

-/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
-/// a struct of fields.
-#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)]
-pub struct CompactKey(i128);
-
 /// The storage key size.
 pub const KEY_SIZE: usize = 18;

@@ -112,10 +107,7 @@ impl Key {
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
    pub fn to_i128(&self) -> i128 {
-        assert!(
-            self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
-            "invalid key: {self}",
-        );
+        assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
        (((self.field1 & 0x7F) as i128) << 120)
            | (((self.field2 & 0xFFFF) as i128) << 104)
            | ((self.field3 as i128) << 72)
@@ -135,14 +127,6 @@ impl Key {
        }
    }

-    pub fn to_compact(&self) -> CompactKey {
-        CompactKey(self.to_i128())
-    }
-
-    pub fn from_compact(k: CompactKey) -> Self {
-        Self::from_i128(k.0)
-    }
-
    pub const fn next(&self) -> Key {
        self.add(1)
    }
@@ -212,13 +196,6 @@ impl fmt::Display for Key {
    }
 }

-impl fmt::Display for CompactKey {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let k = Key::from_compact(*self);
-        k.fmt(f)
-    }
-}
-
 impl Key {
    pub const MIN: Key = Key {
        field1: u8::MIN,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -637,13 +637,6 @@ pub struct TenantInfo {
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
    pub attachment_status: TenantAttachmentStatus,
    pub generation: u32,
-
-    /// Opaque explanation if gc is being blocked.
-    ///
-    /// Only looked up for the individual tenant detail, not the listing. This is purely for
-    /// debugging, not included in openapi.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub gc_blocking: Option<String>,
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -947,8 +940,6 @@ pub struct TopTenantShardsResponse {
 }

 pub mod virtual_file {
-    use std::path::PathBuf;
-
    #[derive(
        Copy,
        Clone,
@@ -967,53 +958,6 @@ pub mod virtual_file {
        #[cfg(target_os = "linux")]
        TokioEpollUring,
    }
-
-    /// Direct IO modes for a pageserver.
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-    pub enum DirectIoMode {
-        /// Direct IO disabled (uses usual buffered IO).
-        #[default]
-        Disabled,
-        /// Direct IO disabled (performs checks and perf simulations).
-        Evaluate {
-            /// Alignment check level
-            alignment_check: DirectIoAlignmentCheckLevel,
-            /// Latency padded for performance simulation.
-            latency_padding: DirectIoLatencyPadding,
-        },
-        /// Direct IO enabled.
-        Enabled {
-            /// Actions to perform on alignment error.
-            on_alignment_error: DirectIoOnAlignmentErrorAction,
-        },
-    }
-
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(rename_all = "kebab-case")]
-    pub enum DirectIoAlignmentCheckLevel {
-        #[default]
-        Error,
-        Log,
-        None,
-    }
-
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(rename_all = "kebab-case")]
-    pub enum DirectIoOnAlignmentErrorAction {
-        Error,
-        #[default]
-        FallbackToBuffered,
-    }
-
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(tag = "type", rename_all = "kebab-case")]
-    pub enum DirectIoLatencyPadding {
-        /// Pad virtual file operations with IO to a fake file.
-        FakeFileRW { path: PathBuf },
-        #[default]
-        None,
-    }
 }

 // Wrapped in libpq CopyData
@@ -1483,7 +1427,6 @@ mod tests {
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
-            gc_blocking: None,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
@@ -1506,7 +1449,6 @@ mod tests {
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
            generation: 1,
-            gc_blocking: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
--- a/libs/pageserver_api/src/models/detach_ancestor.rs
+++ b/libs/pageserver_api/src/models/detach_ancestor.rs
@@ -1,8 +1,6 @@
-use std::collections::HashSet;
-
 use utils::id::TimelineId;

 #[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct AncestorDetached {
-    pub reparented_timelines: HashSet<TimelineId>,
+    pub reparented_timelines: Vec<TimelineId>,
 }
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -1,5 +1,4 @@
-use std::time::SystemTime;
-use utils::{serde_percent::Percent, serde_system_time};
+use utils::serde_system_time::SystemTime;

 /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
 /// the next tenant.
@@ -10,88 +9,19 @@ use utils::{serde_percent::Percent, serde_system_time};
 /// not handle full u64 values properly.
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
 pub struct PageserverUtilization {
-    /// Used disk space (physical, ground truth from statfs())
+    /// Used disk space
    #[serde(serialize_with = "ser_saturating_u63")]
    pub disk_usage_bytes: u64,
    /// Free disk space
    #[serde(serialize_with = "ser_saturating_u63")]
    pub free_space_bytes: u64,
-
-    /// Wanted disk space, based on the tenant shards currently present on this pageserver: this
-    /// is like disk_usage_bytes, but it is stable and does not change with the cache state of
-    /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
-    /// there, or may be unrealistically low if the pageserver has attached tenants which haven't
-    /// downloaded layers yet.
-    #[serde(serialize_with = "ser_saturating_u63", default)]
-    pub disk_wanted_bytes: u64,
-
-    // What proportion of total disk space will this pageserver use before it starts evicting data?
-    #[serde(default = "unity_percent")]
-    pub disk_usable_pct: Percent,
-
-    // How many shards are currently on this node?
-    #[serde(default)]
-    pub shard_count: u32,
-
-    // How many shards should this node be able to handle at most?
-    #[serde(default)]
-    pub max_shard_count: u32,
-
-    /// Cached result of [`Self::score`]
+    /// Lower is better score for how good candidate for a next tenant would this pageserver be.
+    #[serde(serialize_with = "ser_saturating_u63")]
    pub utilization_score: u64,
-
    /// When was this snapshot captured, pageserver local time.
    ///
    /// Use millis to give confidence that the value is regenerated often enough.
-    pub captured_at: serde_system_time::SystemTime,
-}
-
-fn unity_percent() -> Percent {
-    Percent::new(0).unwrap()
-}
-
-impl PageserverUtilization {
-    const UTILIZATION_FULL: u64 = 1000000;
-
-    /// Calculate a utilization score.  The result is to be interpreted as a fraction of
-    /// Self::UTILIZATION_FULL.
-    ///
-    /// Lower values are more affine to scheduling more work on this node.
-    /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work.
-    /// - 0.0 represents an empty node.
-    /// - Negative values are forbidden
-    /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
-    ///   layer eviction.
-    pub fn score(&self) -> u64 {
-        let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
-            * self.disk_usable_pct.get() as u64)
-            / 100;
-        let disk_utilization_score =
-            self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;
-
-        let shard_utilization_score =
-            self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
-        std::cmp::max(disk_utilization_score, shard_utilization_score)
-    }
-
-    pub fn refresh_score(&mut self) {
-        self.utilization_score = self.score();
-    }
-
-    /// A utilization structure that has a full utilization score: use this as a placeholder when
-    /// you need a utilization but don't have real values yet.
-    pub fn full() -> Self {
-        Self {
-            disk_usage_bytes: 1,
-            free_space_bytes: 0,
-            disk_wanted_bytes: 1,
-            disk_usable_pct: Percent::new(100).unwrap(),
-            shard_count: 1,
-            max_shard_count: 1,
-            utilization_score: Self::UTILIZATION_FULL,
-            captured_at: serde_system_time::SystemTime(SystemTime::now()),
-        }
-    }
+    pub captured_at: SystemTime,
 }

 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
@@ -119,19 +49,15 @@ mod tests {
        let doc = PageserverUtilization {
            disk_usage_bytes: u64::MAX,
            free_space_bytes: 0,
-            disk_wanted_bytes: u64::MAX,
-            utilization_score: 13,
-            disk_usable_pct: Percent::new(90).unwrap(),
-            shard_count: 100,
-            max_shard_count: 200,
-            captured_at: serde_system_time::SystemTime(
+            utilization_score: u64::MAX,
+            captured_at: SystemTime(
                std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
            ),
        };

        let s = serde_json::to_string(&doc).unwrap();

-        let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}";
+        let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;

        assert_eq!(s, expected);
    }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -144,20 +144,7 @@ impl PgConnectionConfig {
            // implement and this function is hardly a bottleneck. The function is only called around
            // establishing a new connection.
            #[allow(unstable_name_collisions)]
-            config.options(
-                &self
-                    .options
-                    .iter()
-                    .map(|s| {
-                        if s.contains(['\\', ' ']) {
-                            Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
-                        } else {
-                            Cow::Borrowed(s.as_str())
-                        }
-                    })
-                    .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
-                    .collect::<String>(),
-            );
+            config.options(&encode_options(&self.options));
        }
        config
    }
@@ -178,6 +165,21 @@ impl PgConnectionConfig {
    }
 }

+#[allow(unstable_name_collisions)]
+fn encode_options(options: &[String]) -> String {
+    options
+        .iter()
+        .map(|s| {
+            if s.contains(['\\', ' ']) {
+                Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
+            } else {
+                Cow::Borrowed(s.as_str())
+            }
+        })
+        .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
+        .collect::<String>()
+}
+
 impl fmt::Display for PgConnectionConfig {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The password is intentionally hidden and not part of this display string.
@@ -206,7 +208,7 @@ impl fmt::Debug for PgConnectionConfig {

 #[cfg(test)]
 mod tests_pg_connection_config {
-    use crate::PgConnectionConfig;
+    use crate::{encode_options, PgConnectionConfig};
    use once_cell::sync::Lazy;
    use url::Host;

@@ -255,18 +257,12 @@ mod tests_pg_connection_config {

    #[test]
    fn test_with_options() {
-        let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([
-            "hello",
-            "world",
-            "with space",
-            "and \\ backslashes",
+        let options = encode_options(&[
+            "hello".to_owned(),
+            "world".to_owned(),
+            "with space".to_owned(),
+            "and \\ backslashes".to_owned(),
        ]);
-        assert_eq!(cfg.host(), &*STUB_HOST);
-        assert_eq!(cfg.port(), 123);
-        assert_eq!(cfg.raw_address(), "stub.host.example:123");
-        assert_eq!(
-            cfg.to_tokio_postgres_config().get_options(),
-            Some("hello world with\\ space and\\ \\\\\\ backslashes")
-        );
+        assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
    }
 }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -144,7 +144,6 @@ impl RemotePath {
 ///
 /// The WithDelimiter mode will populate `prefixes` and `keys` in the result.  The
 /// NoDelimiter mode will only populate `keys`.
-#[derive(Copy, Clone)]
 pub enum ListingMode {
    WithDelimiter,
    NoDelimiter,
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -5,40 +5,13 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
 /// Can be cloned, moved and kept around in futures as "guard objects".
 #[derive(Clone)]
 pub struct Completion {
-    token: TaskTrackerToken,
-}
-
-impl std::fmt::Debug for Completion {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Completion")
-            .field("siblings", &self.token.task_tracker().len())
-            .finish()
-    }
-}
-
-impl Completion {
-    /// Returns true if this completion is associated with the given barrier.
-    pub fn blocks(&self, barrier: &Barrier) -> bool {
-        TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0)
-    }
-
-    pub fn barrier(&self) -> Barrier {
-        Barrier(self.token.task_tracker().clone())
-    }
+    _token: TaskTrackerToken,
 }

 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
 pub struct Barrier(TaskTracker);

-impl std::fmt::Debug for Barrier {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Barrier")
-            .field("remaining", &self.0.len())
-            .finish()
-    }
-}
-
 impl Default for Barrier {
    fn default() -> Self {
        let (_, rx) = channel();
@@ -78,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) {
    tracker.close();

    let token = tracker.token();
-    (Completion { token }, Barrier(tracker))
+    (Completion { _token: token }, Barrier(tracker))
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -128,7 +128,7 @@ pub mod circuit_breaker;
 ///
 /// #############################################################################################
 /// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
-/// We used `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
+/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
 /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
 /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
 /// The problem needs further investigation and regular `const` declaration instead of a macro.
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -78,9 +78,8 @@ impl Drop for GateGuard {
    }
 }

-#[derive(Debug, thiserror::Error)]
+#[derive(Debug)]
 pub enum GateError {
-    #[error("gate is closed")]
    GateClosed,
 }

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -49,7 +49,6 @@ postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 rand.workspace = true
-range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
@@ -108,7 +107,3 @@ harness = false
 [[bench]]
 name = "bench_walredo"
 harness = false
-
-[[bench]]
-name = "bench_ingest"
-harness = false
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -1,239 +0,0 @@
-use std::{env, num::NonZeroUsize};
-
-use bytes::Bytes;
-use camino::Utf8PathBuf;
-use criterion::{criterion_group, criterion_main, Criterion};
-use pageserver::{
-    config::PageServerConf,
-    context::{DownloadBehavior, RequestContext},
-    l0_flush::{L0FlushConfig, L0FlushGlobalState},
-    page_cache,
-    repository::Value,
-    task_mgr::TaskKind,
-    tenant::storage_layer::InMemoryLayer,
-    virtual_file,
-};
-use pageserver_api::{key::Key, shard::TenantShardId};
-use utils::{
-    bin_ser::BeSer,
-    id::{TenantId, TimelineId},
-};
-
-// A very cheap hash for generating non-sequential keys.
-fn murmurhash32(mut h: u32) -> u32 {
-    h ^= h >> 16;
-    h = h.wrapping_mul(0x85ebca6b);
-    h ^= h >> 13;
-    h = h.wrapping_mul(0xc2b2ae35);
-    h ^= h >> 16;
-    h
-}
-
-enum KeyLayout {
-    /// Sequential unique keys
-    Sequential,
-    /// Random unique keys
-    Random,
-    /// Random keys, but only use the bits from the mask of them
-    RandomReuse(u32),
-}
-
-enum WriteDelta {
-    Yes,
-    No,
-}
-
-async fn ingest(
-    conf: &'static PageServerConf,
-    put_size: usize,
-    put_count: usize,
-    key_layout: KeyLayout,
-    write_delta: WriteDelta,
-) -> anyhow::Result<()> {
-    let mut lsn = utils::lsn::Lsn(1000);
-    let mut key = Key::from_i128(0x0);
-
-    let timeline_id = TimelineId::generate();
-    let tenant_id = TenantId::generate();
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
-
-    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-
-    let gate = utils::sync::gate::Gate::default();
-    let entered = gate.enter().unwrap();
-
-    let layer =
-        InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
-
-    let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
-    let ctx = RequestContext::new(
-        pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
-        pageserver::context::DownloadBehavior::Download,
-    );
-
-    for i in 0..put_count {
-        lsn += put_size as u64;
-
-        // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
-        // usually care the most about write performance when they're blasting a huge batch of data into a huge table.
-        match key_layout {
-            KeyLayout::Sequential => {
-                // Use sequential order to illustrate the experience a user is likely to have
-                // when ingesting bulk data.
-                key.field6 = i as u32;
-            }
-            KeyLayout::Random => {
-                // Use random-order keys to avoid giving a false advantage to data structures that are
-                // faster when inserting on the end.
-                key.field6 = murmurhash32(i as u32);
-            }
-            KeyLayout::RandomReuse(mask) => {
-                // Use low bits only, to limit cardinality
-                key.field6 = murmurhash32(i as u32) & mask;
-            }
-        }
-
-        layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
-    }
-    layer.freeze(lsn + 1).await;
-
-    if matches!(write_delta, WriteDelta::Yes) {
-        let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
-            max_concurrency: NonZeroUsize::new(1).unwrap(),
-        });
-        let (_desc, path) = layer
-            .write_to_disk(&ctx, None, l0_flush_state.inner())
-            .await?
-            .unwrap();
-        tokio::fs::remove_file(path).await?;
-    }
-
-    Ok(())
-}
-
-/// Wrapper to instantiate a tokio runtime
-fn ingest_main(
-    conf: &'static PageServerConf,
-    put_size: usize,
-    put_count: usize,
-    key_layout: KeyLayout,
-    write_delta: WriteDelta,
-) {
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .unwrap();
-
-    runtime.block_on(async move {
-        let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
-        if let Err(e) = r {
-            panic!("{e:?}");
-        }
-    });
-}
-
-/// Declare a series of benchmarks for the Pageserver's ingest write path.
-///
-/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either
-/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
-///
-/// Genuine disk I/O is used, so expect results to differ depending on storage.  However, when running on
-/// a fast disk, CPU is the bottleneck at time of writing.
-fn criterion_benchmark(c: &mut Criterion) {
-    let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
-    let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap();
-    eprintln!("Data directory: {}", temp_dir.path());
-
-    let conf: &'static PageServerConf = Box::leak(Box::new(
-        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
-    ));
-    virtual_file::init(16384, virtual_file::io_engine_for_bench());
-    page_cache::init(conf.page_cache_size);
-
-    {
-        let mut group = c.benchmark_group("ingest-small-values");
-        let put_size = 100usize;
-        let put_count = 128 * 1024 * 1024 / put_size;
-        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
-        group.sample_size(10);
-        group.bench_function("ingest 128MB/100b seq", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b rand", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Random,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::RandomReuse(0x3ff),
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b seq, no delta", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::No,
-                )
-            })
-        });
-    }
-
-    {
-        let mut group = c.benchmark_group("ingest-big-values");
-        let put_size = 8192usize;
-        let put_count = 128 * 1024 * 1024 / put_size;
-        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
-        group.sample_size(10);
-        group.bench_function("ingest 128MB/8k seq", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/8k seq, no delta", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::No,
-                )
-            })
-        });
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
-criterion_main!(benches);
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,4 +1,3 @@
-use criterion::measurement::WallTime;
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
@@ -16,11 +15,7 @@ use utils::id::{TenantId, TimelineId};

 use utils::lsn::Lsn;

-use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
-
-fn fixture_path(relative: &str) -> PathBuf {
-    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
-}
+use criterion::{black_box, criterion_group, criterion_main, Criterion};

 fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
    let mut layer_map = LayerMap::default();
@@ -114,7 +109,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
 // between each test run.
 fn bench_from_captest_env(c: &mut Criterion) {
    // TODO consider compressing this file
-    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

    // Test with uniform query pattern
@@ -144,7 +139,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
 fn bench_from_real_project(c: &mut Criterion) {
    // Init layer map
    let now = Instant::now();
-    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
    println!("Finished layer map init in {:?}", now.elapsed());

    // Choose uniformly distributed queries
@@ -247,72 +242,7 @@ fn bench_sequential(c: &mut Criterion) {
    group.finish();
 }

-fn bench_visibility_with_map(
-    group: &mut BenchmarkGroup<WallTime>,
-    layer_map: LayerMap,
-    read_points: Vec<Lsn>,
-    bench_name: &str,
-) {
-    group.bench_function(bench_name, |b| {
-        b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
-    });
-}
-
-// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
-fn bench_visibility(c: &mut Criterion) {
-    let mut group = c.benchmark_group("visibility");
-    {
-        // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
-        let now = Instant::now();
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-        for i in 0..100_000 {
-            let i32 = (i as u32) % 100;
-            let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-            let layer = PersistentLayerDesc::new_img(
-                TenantShardId::unsharded(TenantId::generate()),
-                TimelineId::generate(),
-                zero.add(10 * i32)..zero.add(10 * i32 + 1),
-                Lsn(i),
-                0,
-            );
-            updates.insert_historic(layer);
-        }
-        updates.flush();
-        println!("Finished layer map init in {:?}", now.elapsed());
-
-        let mut read_points = Vec::new();
-        for i in (0..100_000).step_by(1000) {
-            read_points.push(Lsn(i));
-        }
-
-        bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
-    }
-
-    {
-        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
-        let read_points = vec![Lsn(0x1C760FA190)];
-        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
-
-        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
-        let read_points = vec![
-            Lsn(0x1C760FA190),
-            Lsn(0x000000931BEAD539),
-            Lsn(0x000000931BF63011),
-            Lsn(0x000000931B33AE68),
-            Lsn(0x00000038E67ABFA0),
-            Lsn(0x000000931B33AE68),
-            Lsn(0x000000914E3F38F0),
-            Lsn(0x000000931B33AE68),
-        ];
-        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
-    }
-
-    group.finish();
-}
-
 criterion_group!(group_1, bench_from_captest_env);
 criterion_group!(group_2, bench_from_real_project);
 criterion_group!(group_3, bench_sequential);
-criterion_group!(group_4, bench_visibility);
-criterion_main!(group_1, group_2, group_3, group_4);
+criterion_main!(group_1, group_2, group_3);
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -17,9 +17,11 @@ use pageserver::config::PageserverIdentity;
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
-use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
+use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
-use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
+use pageserver::{
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
+};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
@@ -29,9 +31,11 @@ use tracing::*;
 use metrics::set_build_info_metric;
 use pageserver::{
    config::PageServerConf,
+    context::{DownloadBehavior, RequestContext},
    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
-    task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
+    task_mgr::TaskKind,
+    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
    tenant::mgr,
    virtual_file,
 };
@@ -123,7 +127,8 @@ fn main() -> anyhow::Result<()> {

    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
-    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
+    info!(?conf.get_impl, "starting with get page implementation");
+    info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

    let tenants_path = conf.tenants_path();
@@ -589,13 +594,30 @@ fn start_pageserver(

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
-        let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
-        pageserver_listener
-            .set_nonblocking(true)
-            .context("set listener to nonblocking")?;
-        tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
-    });
+    let libpq_listener = {
+        let cancel = CancellationToken::new();
+        let libpq_ctx = RequestContext::todo_child(
+            TaskKind::LibpqEndpointListener,
+            // listener task shouldn't need to download anything. (We will
+            // create separate sub-contexts for each connection, with their
+            // own download behavior. This context is used only to listen and
+            // accept connections.)
+            DownloadBehavior::Error,
+        );
+
+        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+            "libpq listener",
+            page_service::libpq_listener_main(
+                tenant_manager.clone(),
+                pg_auth,
+                pageserver_listener,
+                conf.pg_auth_type,
+                libpq_ctx,
+                cancel.clone(),
+            ),
+        ));
+        LibpqEndpointListener(CancellableTask { task, cancel })
+    };

    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

@@ -623,7 +645,7 @@ fn start_pageserver(
            shutdown_pageserver.take();
            pageserver::shutdown_pageserver(
                http_endpoint_listener,
-                page_service,
+                libpq_listener,
                consumption_metrics_tasks,
                disk_usage_eviction_task,
                &tenant_manager,
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -29,12 +29,12 @@ use utils::{
    logging::LogFormat,
 };

-use crate::l0_flush::L0FlushConfig;
-use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
+use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
+use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
 use crate::{tenant::config::TenantConf, virtual_file};
 use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};

@@ -133,8 +133,14 @@ pub mod defaults {

 #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'

+#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
+
+#get_impl = '{DEFAULT_GET_IMPL}'
+
 #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'

+#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -272,8 +278,14 @@ pub struct PageServerConf {

    pub virtual_file_io_engine: virtual_file::IoEngineKind,

+    pub get_vectored_impl: GetVectoredImpl,
+
+    pub get_impl: GetImpl,
+
    pub max_vectored_read_bytes: MaxVectoredReadBytes,

+    pub validate_vectored_get: bool,
+
    pub image_compression: ImageCompressionAlgorithm,

    /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM.  When this
@@ -288,9 +300,6 @@ pub struct PageServerConf {
    /// This flag is temporary and will be removed after gradual rollout.
    /// See <https://github.com/neondatabase/neon/issues/8184>.
    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
-
-    /// Direct IO settings
-    pub virtual_file_direct_io: virtual_file::DirectIoMode,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -384,8 +393,14 @@ struct PageServerConfigBuilder {

    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,

+    get_vectored_impl: BuilderValue<GetVectoredImpl>,
+
+    get_impl: BuilderValue<GetImpl>,
+
    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,

+    validate_vectored_get: BuilderValue<bool>,
+
    image_compression: BuilderValue<ImageCompressionAlgorithm>,

    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
@@ -393,8 +408,6 @@ struct PageServerConfigBuilder {
    l0_flush: BuilderValue<L0FlushConfig>,

    compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
-
-    virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
 }

 impl PageServerConfigBuilder {
@@ -475,14 +488,16 @@ impl PageServerConfigBuilder {

            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),

+            get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
+            get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
            image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
+            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: Set(L0FlushConfig::default()),
            compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
-            virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
        }
    }
 }
@@ -638,10 +653,22 @@ impl PageServerConfigBuilder {
        self.virtual_file_io_engine = BuilderValue::Set(value);
    }

+    pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) {
+        self.get_vectored_impl = BuilderValue::Set(value);
+    }
+
+    pub fn get_impl(&mut self, value: GetImpl) {
+        self.get_impl = BuilderValue::Set(value);
+    }
+
    pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
        self.max_vectored_read_bytes = BuilderValue::Set(value);
    }

+    pub fn get_validate_vectored_get(&mut self, value: bool) {
+        self.validate_vectored_get = BuilderValue::Set(value);
+    }
+
    pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
        self.image_compression = BuilderValue::Set(value);
    }
@@ -658,10 +685,6 @@ impl PageServerConfigBuilder {
        self.compact_level0_phase1_value_access = BuilderValue::Set(value);
    }

-    pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) {
-        self.virtual_file_direct_io = BuilderValue::Set(value);
-    }
-
    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -712,12 +735,14 @@ impl PageServerConfigBuilder {
                heatmap_upload_concurrency,
                secondary_download_concurrency,
                ingest_batch_size,
+                get_vectored_impl,
+                get_impl,
                max_vectored_read_bytes,
+                validate_vectored_get,
                image_compression,
                ephemeral_bytes_per_memory_kb,
                l0_flush,
                compact_level0_phase1_value_access,
-                virtual_file_direct_io,
            }
            CUSTOM LOGIC
            {
@@ -966,12 +991,21 @@ impl PageServerConf {
                "virtual_file_io_engine" => {
                    builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
                }
+                "get_vectored_impl" => {
+                    builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
+                }
+                "get_impl" => {
+                    builder.get_impl(parse_toml_from_str("get_impl", item)?)
+                }
                "max_vectored_read_bytes" => {
                    let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
                    builder.get_max_vectored_read_bytes(
                        MaxVectoredReadBytes(
                            NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
                }
+                "validate_vectored_get" => {
+                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
+                }
                "image_compression" => {
                    builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
                }
@@ -984,9 +1018,6 @@ impl PageServerConf {
                "compact_level0_phase1_value_access" => {
                    builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
                }
-                "virtual_file_direct_io" => {
-                    builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
-                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1061,15 +1092,17 @@ impl PageServerConf {
            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
+            get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+            get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
            max_vectored_read_bytes: MaxVectoredReadBytes(
                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                    .expect("Invalid default constant"),
            ),
            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
+            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
            l0_flush: L0FlushConfig::default(),
            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-            virtual_file_direct_io: virtual_file::DirectIoMode::default(),
        }
    }
 }
@@ -1301,15 +1334,17 @@ background_task_maximum_delay = '334 s'
                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
+                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+                get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
                max_vectored_read_bytes: MaxVectoredReadBytes(
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
                ),
+                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1374,15 +1409,17 @@ background_task_maximum_delay = '334 s'
                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                ingest_batch_size: 100,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
+                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+                get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
                max_vectored_read_bytes: MaxVectoredReadBytes(
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
                ),
+                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -308,45 +308,6 @@ paths:
            application/json:
              schema:
                type: string
-
-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Persistently add a gc blocking at the tenant level because of this timeline
-      responses:
-        "200":
-          description: OK
-
-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Persistently remove a tenant level gc blocking for this timeline
-      responses:
-        "200":
-          description: OK
-
  /v1/tenant/{tenant_shard_id}/location_config:
    parameters:
      - name: tenant_shard_id
@@ -932,7 +893,7 @@ components:
          description: Whether to poll remote storage for layers to download.  If false, secondary locations don't download anything.
    ArchivalConfigRequest:
      type: object
-      required:
+      required:
        - state
      properties:
        state:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -296,11 +296,6 @@ impl From<GetActiveTenantError> for ApiError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => {
                ApiError::ResourceUnavailable(format!("{}", e).into())
            }
-            GetActiveTenantError::SwitchedTenant => {
-                // in our HTTP handlers, this error doesn't happen
-                // TODO: separate error types
-                ApiError::ResourceUnavailable("switched tenant".into())
-            }
        }
    }
 }
@@ -935,7 +930,6 @@ async fn tenant_list_handler(
            generation: (*gen)
                .into()
                .expect("Tenants are always attached with a generation"),
-            gc_blocking: None,
        })
        .collect::<Vec<TenantInfo>>();

@@ -987,7 +981,6 @@ async fn tenant_status(
                    .generation()
                    .into()
                    .expect("Tenants are always attached with a generation"),
-                gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
            },
            walredo: tenant.wal_redo_manager_status(),
            timelines: tenant.list_timeline_ids(),
@@ -1162,10 +1155,7 @@ async fn layer_map_info_handler(
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
-    let layer_map_info = timeline
-        .layer_map_info(reset)
-        .await
-        .map_err(|_shutdown| ApiError::ShuttingDown)?;
+    let layer_map_info = timeline.layer_map_info(reset).await;

    json_response(StatusCode::OK, layer_map_info)
 }
@@ -1231,72 +1221,6 @@ async fn evict_timeline_layer_handler(
    }
 }

-async fn timeline_gc_blocking_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    block_or_unblock_gc(request, true).await
-}
-
-async fn timeline_gc_unblocking_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    block_or_unblock_gc(request, false).await
-}
-
-/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
-///
-/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
-async fn block_or_unblock_gc(
-    request: Request<Body>,
-    block: bool,
-) -> Result<Response<Body>, ApiError> {
-    use crate::tenant::{
-        remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
-    };
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let state = get_state(&request);
-
-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-    let timeline = tenant.get_timeline(timeline_id, true)?;
-
-    let fut = async {
-        if block {
-            timeline.block_gc(&tenant).await.map(|_| ())
-        } else {
-            timeline.unblock_gc(&tenant).await
-        }
-    };
-
-    let span = tracing::info_span!(
-        "block_or_unblock_gc",
-        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
-        timeline_id = %timeline_id,
-        block = block,
-    );
-
-    let res = fut.instrument(span).await;
-
-    res.map_err(|e| {
-        if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
-            ApiError::ShuttingDown
-        } else {
-            ApiError::InternalServerError(e)
-        }
-    })?;
-
-    json_response(StatusCode::OK, ())
-}
-
 /// Get tenant_size SVG graph along with the JSON data.
 fn synthetic_size_html_response(
    inputs: ModelInputs,
@@ -1887,7 +1811,7 @@ async fn timeline_detach_ancestor_handler(
        // drop(tenant);

        let resp = match progress {
-            detach_ancestor::Progress::Prepared(attempt, prepared) => {
+            detach_ancestor::Progress::Prepared(_guard, prepared) => {
                // it would be great to tag the guard on to the tenant activation future
                let reparented_timelines = state
                    .tenant_manager
@@ -1895,10 +1819,10 @@ async fn timeline_detach_ancestor_handler(
                        tenant_shard_id,
                        timeline_id,
                        prepared,
-                        attempt,
                        ctx,
                    )
                    .await
+                    .context("timeline detach ancestor completion")
                    .map_err(ApiError::InternalServerError)?;

                AncestorDetached {
@@ -2357,9 +2281,8 @@ async fn get_utilization(
    // regenerate at most 1Hz to allow polling at any rate.
    if !still_valid {
        let path = state.conf.tenants_path();
-        let doc =
-            crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager)
-                .map_err(ApiError::InternalServerError)?;
+        let doc = crate::utilization::regenerate(path.as_std_path())
+            .map_err(ApiError::InternalServerError)?;

        let mut buf = Vec::new();
        serde_json::to_writer(&mut buf, &doc)
@@ -2976,14 +2899,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, evict_timeline_layer_handler),
        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
-            |r| api_handler(r, timeline_gc_blocking_handler),
-        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
-            |r| api_handler(r, timeline_gc_unblocking_handler),
-        )
        .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
            api_handler(r, secondary_upload_handler)
        })
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -24,7 +24,7 @@ impl Default for L0FlushConfig {
 #[derive(Clone)]
 pub struct L0FlushGlobalState(Arc<Inner>);

-pub enum Inner {
+pub(crate) enum Inner {
    PageCached,
    Direct { semaphore: tokio::sync::Semaphore },
 }
@@ -40,7 +40,7 @@ impl L0FlushGlobalState {
        }
    }

-    pub fn inner(&self) -> &Arc<Inner> {
+    pub(crate) fn inner(&self) -> &Arc<Inner> {
        &self.0
    }
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -12,8 +12,6 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
-
-use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
 pub mod aux_file;
@@ -32,13 +30,14 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
 use tenant::{
    mgr::{BackgroundPurges, TenantManager},
    secondary,
 };
-use tracing::{info, info_span};
+use tracing::info;

 /// Current storage format version
 ///
@@ -64,6 +63,7 @@ pub struct CancellableTask {
    pub cancel: CancellationToken,
 }
 pub struct HttpEndpointListener(pub CancellableTask);
+pub struct LibpqEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
 impl CancellableTask {
@@ -77,7 +77,7 @@ impl CancellableTask {
 #[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
    http_listener: HttpEndpointListener,
-    page_service: page_service::Listener,
+    libpq_listener: LibpqEndpointListener,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
@@ -87,83 +87,10 @@ pub async fn shutdown_pageserver(
    exit_code: i32,
 ) {
    use std::time::Duration;
-
-    // If the orderly shutdown below takes too long, we still want to make
-    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
-    //
-    // (Leftover walredo processes are the hypothesized trigger for the systemd freezes
-    //  that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
-    //
-    // We use a thread instead of a tokio task because the background runtime is likely busy
-    // with the final flushing / uploads. This activity here has priority, and due to lack
-    // of scheduling priority feature sin the tokio scheduler, using a separate thread is
-    // an effective priority booster.
-    let walredo_extraordinary_shutdown_thread_span = {
-        let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
-        span.follows_from(tracing::Span::current());
-        span
-    };
-    let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
-    let walredo_extraordinary_shutdown_thread = std::thread::spawn({
-        let walredo_extraordinary_shutdown_thread_cancel =
-            walredo_extraordinary_shutdown_thread_cancel.clone();
-        move || {
-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .unwrap();
-            let _entered = rt.enter();
-            let _entered = walredo_extraordinary_shutdown_thread_span.enter();
-            if let Ok(()) = rt.block_on(tokio::time::timeout(
-                Duration::from_secs(8),
-                walredo_extraordinary_shutdown_thread_cancel.cancelled(),
-            )) {
-                info!("cancellation requested");
-                return;
-            }
-            let managers = tenant::WALREDO_MANAGERS
-                .lock()
-                .unwrap()
-                // prevents new walredo managers from being inserted
-                .take()
-                .expect("only we take()");
-            // Use FuturesUnordered to get in queue early for each manager's
-            // heavier_once_cell semaphore wait list.
-            // Also, for idle tenants that for some reason haven't
-            // shut down yet, it's quite likely that we're not going
-            // to get Poll::Pending once.
-            let mut futs: FuturesUnordered<_> = managers
-                .into_iter()
-                .filter_map(|(_, mgr)| mgr.upgrade())
-                .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
-                .collect();
-            info!(count=%futs.len(), "built FuturesUnordered");
-            let mut last_log_at = std::time::Instant::now();
-            #[derive(Debug, Default)]
-            struct Results {
-                initiated: u64,
-                already: u64,
-            }
-            let mut results = Results::default();
-            while let Some(we_initiated) = rt.block_on(futs.next()) {
-                if we_initiated {
-                    results.initiated += 1;
-                } else {
-                    results.already += 1;
-                }
-                if last_log_at.elapsed() > Duration::from_millis(100) {
-                    info!(remaining=%futs.len(), ?results, "progress");
-                    last_log_at = std::time::Instant::now();
-                }
-            }
-            info!(?results, "done");
-        }
-    });
-
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    let remaining_connections = timed(
-        page_service.stop_accepting(),
+    timed(
+        libpq_listener.0.shutdown(),
        "shutdown LibpqEndpointListener",
        Duration::from_secs(1),
    )
@@ -181,7 +108,7 @@ pub async fn shutdown_pageserver(
    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
    // should already have been canclled via mgr::shutdown_all_tenants
    timed(
-        remaining_connections.shutdown(),
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
        "shutdown PageRequestHandlers",
        Duration::from_secs(1),
    )
@@ -235,12 +162,6 @@ pub async fn shutdown_pageserver(
        Duration::from_secs(1),
    )
    .await;
-
-    info!("cancel & join walredo_extraordinary_shutdown_thread");
-    walredo_extraordinary_shutdown_thread_cancel.cancel();
-    walredo_extraordinary_shutdown_thread.join().unwrap();
-    info!("walredo_extraordinary_shutdown_thread done");
-
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -525,15 +525,6 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_visible_physical_size",
-        "The size of the layer files present in the pageserver's filesystem.",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
        "pageserver_resident_physical_size_global",
@@ -2213,7 +2204,6 @@ pub(crate) struct TimelineMetrics {
    pub(crate) layer_count_delta: UIntGauge,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
-    pub visible_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub aux_file_size_gauge: IntGauge,
@@ -2336,9 +2326,6 @@ impl TimelineMetrics {
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
-        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        // TODO: we shouldn't expose this metric
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2393,7 +2380,6 @@ impl TimelineMetrics {
            layer_count_delta,
            standby_horizon_gauge,
            resident_physical_size_gauge,
-            visible_physical_size_gauge,
            current_logical_size_gauge,
            aux_file_size_gauge,
            directory_entries_count_gauge,
@@ -2445,7 +2431,6 @@ impl TimelineMetrics {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
-        let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -56,6 +56,7 @@ impl Statvfs {
 }

 pub mod mock {
+    use anyhow::Context;
    use camino::Utf8Path;
    use regex::Regex;
    use tracing::log::info;
@@ -134,30 +135,14 @@ pub mod mock {
            {
                continue;
            }
-            let m = match entry.metadata() {
-                Ok(m) => m,
-                Err(e) if is_not_found(&e) => {
-                    // some temp file which got removed right as we are walking
-                    continue;
-                }
-                Err(e) => {
-                    return Err(anyhow::Error::new(e)
-                        .context(format!("get metadata of {:?}", entry.path())))
-                }
-            };
-            total += m.len();
+            total += entry
+                .metadata()
+                .with_context(|| format!("get metadata of {:?}", entry.path()))?
+                .len();
        }
        Ok(total)
    }

-    fn is_not_found(e: &walkdir::Error) -> bool {
-        let Some(io_error) = e.io_error() else {
-            return false;
-        };
-        let kind = io_error.kind();
-        matches!(kind, std::io::ErrorKind::NotFound)
-    }
-
    pub struct Statvfs {
        pub blocks: u64,
        pub blocks_available: u64,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -33,7 +33,6 @@ use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
 use std::collections::BTreeMap;
 use std::fmt;
-use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
@@ -41,7 +40,6 @@ use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use upload_queue::NotInitialized;
 use utils::backoff;
 use utils::circuit_breaker::CircuitBreaker;
 use utils::completion;
@@ -149,7 +147,6 @@ pub(crate) mod timeline;

 pub mod size;

-mod gc_block;
 pub(crate) mod throttle;

 pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -302,19 +299,9 @@ pub struct Tenant {
    pub(crate) timeline_get_throttle:
        Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,

-    /// An ongoing timeline detach concurrency limiter.
-    ///
-    /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense
-    /// to have two running at the same time. A different one can be started if an earlier one
-    /// has failed for whatever reason.
+    /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
    ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,

-    /// `index_part.json` based gc blocking reason tracking.
-    ///
-    /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
-    /// proceeding.
-    pub(crate) gc_block: gc_block::GcBlock,
-
    l0_flush_global_state: L0FlushGlobalState,
 }

@@ -325,66 +312,14 @@ impl std::fmt::Debug for Tenant {
 }

 pub(crate) enum WalRedoManager {
-    Prod(WalredoManagerId, PostgresRedoManager),
+    Prod(PostgresRedoManager),
    #[cfg(test)]
    Test(harness::TestRedoManager),
 }

-#[derive(thiserror::Error, Debug)]
-#[error("pageserver is shutting down")]
-pub(crate) struct GlobalShutDown;
-
-impl WalRedoManager {
-    pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
-        let id = WalredoManagerId::next();
-        let arc = Arc::new(Self::Prod(id, mgr));
-        let mut guard = WALREDO_MANAGERS.lock().unwrap();
-        match &mut *guard {
-            Some(map) => {
-                map.insert(id, Arc::downgrade(&arc));
-                Ok(arc)
-            }
-            None => Err(GlobalShutDown),
-        }
-    }
-}
-
-impl Drop for WalRedoManager {
-    fn drop(&mut self) {
-        match self {
-            Self::Prod(id, _) => {
-                let mut guard = WALREDO_MANAGERS.lock().unwrap();
-                if let Some(map) = &mut *guard {
-                    map.remove(id).expect("new() registers, drop() unregisters");
-                }
-            }
-            #[cfg(test)]
-            Self::Test(_) => {
-                // Not applicable to test redo manager
-            }
-        }
-    }
-}
-
-/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
-/// the walredo processes outside of the regular order.
-///
-/// This is necessary to work around a systemd bug where it freezes if there are
-/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
-#[allow(clippy::type_complexity)]
-pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
-    Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
-> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
-#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
-pub(crate) struct WalredoManagerId(u64);
-impl WalredoManagerId {
-    pub fn next() -> Self {
-        static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
-        let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        if id == 0 {
-            panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
-        }
-        Self(id)
+impl From<PostgresRedoManager> for WalRedoManager {
+    fn from(mgr: PostgresRedoManager) -> Self {
+        Self::Prod(mgr)
    }
 }

@@ -396,20 +331,19 @@ impl From<harness::TestRedoManager> for WalRedoManager {
 }

 impl WalRedoManager {
-    pub(crate) async fn shutdown(&self) -> bool {
+    pub(crate) async fn shutdown(&self) {
        match self {
-            Self::Prod(_, mgr) => mgr.shutdown().await,
+            Self::Prod(mgr) => mgr.shutdown().await,
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
-                true
            }
        }
    }

    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
        match self {
-            Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
+            Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
@@ -429,7 +363,7 @@ impl WalRedoManager {
        pg_version: u32,
    ) -> Result<bytes::Bytes, walredo::Error> {
        match self {
-            Self::Prod(_, mgr) => {
+            Self::Prod(mgr) => {
                mgr.request_redo(key, lsn, base_img, records, pg_version)
                    .await
            }
@@ -443,7 +377,7 @@ impl WalRedoManager {

    pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
        match self {
-            WalRedoManager::Prod(_, m) => Some(m.status()),
+            WalRedoManager::Prod(m) => Some(m.status()),
            #[cfg(test)]
            WalRedoManager::Test(_) => None,
        }
@@ -452,8 +386,6 @@ impl WalRedoManager {

 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 pub enum GetTimelineError {
-    #[error("Timeline is shutting down")]
-    ShuttingDown,
    #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
    NotActive {
        tenant_id: TenantShardId,
@@ -606,21 +538,6 @@ impl From<PageReconstructError> for GcError {
    }
 }

-impl From<NotInitialized> for GcError {
-    fn from(value: NotInitialized) -> Self {
-        match value {
-            NotInitialized::Uninitialized => GcError::Remote(value.into()),
-            NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled,
-        }
-    }
-}
-
-impl From<timeline::layer_manager::Shutdown> for GcError {
-    fn from(_: timeline::layer_manager::Shutdown) -> Self {
-        GcError::TimelineCancelled
-    }
-}
-
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum LoadConfigError {
    #[error("TOML deserialization error: '{0}'")]
@@ -730,7 +647,6 @@ impl Tenant {
                    .read()
                    .await
                    .layer_map()
-                    .expect("currently loading, layer manager cannot be shutdown already")
                    .iter_historic_layers()
                    .next()
                    .is_some(),
@@ -759,9 +675,11 @@ impl Tenant {
        init_order: Option<InitializationOrder>,
        mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<Arc<Tenant>, GlobalShutDown> {
-        let wal_redo_manager =
-            WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
+    ) -> Arc<Tenant> {
+        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
+            conf,
+            tenant_shard_id,
+        )));

        let TenantSharedResources {
            broker_client,
@@ -837,9 +755,9 @@ impl Tenant {
                            // The Stopping case is for when we have passed control on to DeleteTenantFlow:
                            // if it errors, we will call make_broken when tenant is already in Stopping.
                            assert!(
-                                matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
-                                "the attach task owns the tenant state until activation is complete"
-                            );
+                            matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
+                            "the attach task owns the tenant state until activation is complete"
+                        );

                            *state = TenantState::broken_from_reason(err.to_string());
                        });
@@ -960,7 +878,7 @@ impl Tenant {
            }
            .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
        );
-        Ok(tenant)
+        tenant
    }

    #[instrument(skip_all)]
@@ -1064,8 +982,6 @@ impl Tenant {
            }
        }

-        let mut gc_blocks = HashMap::new();
-
        // For every timeline, download the metadata file, scan the local directory,
        // and build a layer map that contains an entry for each remote and local
        // layer file.
@@ -1075,16 +991,6 @@ impl Tenant {
                .remove(&timeline_id)
                .expect("just put it in above");

-            if let Some(blocking) = index_part.gc_blocking.as_ref() {
-                // could just filter these away, but it helps while testing
-                anyhow::ensure!(
-                    !blocking.reasons.is_empty(),
-                    "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
-                );
-                let prev = gc_blocks.insert(timeline_id, blocking.reasons);
-                assert!(prev.is_none());
-            }
-
            // TODO again handle early failure
            self.load_remote_timeline(
                timeline_id,
@@ -1129,8 +1035,6 @@ impl Tenant {
        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

-        self.gc_block.set_scanned(gc_blocks);
-
        fail::fail_point!("attach-before-activate", |_| {
            anyhow::bail!("attach-before-activate");
        });
@@ -1676,7 +1580,7 @@ impl Tenant {
        self: Arc<Self>,
        timeline_id: TimelineId,
    ) -> Result<(), DeleteTimelineError> {
-        DeleteTimelineFlow::run(&self, timeline_id).await?;
+        DeleteTimelineFlow::run(&self, timeline_id, false).await?;

        Ok(())
    }
@@ -1721,14 +1625,6 @@ impl Tenant {
            }
        }

-        let _guard = match self.gc_block.start().await {
-            Ok(guard) => guard,
-            Err(reasons) => {
-                info!("Skipping GC: {reasons}");
-                return Ok(GcResult::default());
-            }
-        };
-
        self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
            .await
    }
@@ -2741,7 +2637,6 @@ impl Tenant {
            )),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
            ongoing_timeline_detach: std::sync::Mutex::default(),
-            gc_block: Default::default(),
            l0_flush_global_state,
        }
    }
@@ -3026,6 +2921,54 @@ impl Tenant {
        // because that will stall branch creation.
        let gc_cs = self.gc_cs.lock().await;

+        // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
+        // depend on.  So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
+        // and fail out if it's inaccurate.
+        // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
+        {
+            let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
+                BTreeMap::new();
+            timelines.iter().for_each(|timeline| {
+                if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
+                    let ancestor_children =
+                        all_branchpoints.entry(*ancestor_timeline_id).or_default();
+                    ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
+                }
+            });
+
+            for timeline in &timelines {
+                let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
+                    .remove(&timeline.timeline_id)
+                    .unwrap_or_default();
+                // Ordering is irrelevant to the containment check below; it only makes the error log easier to read.
+                branchpoints.sort_by_key(|b| b.0);
+
+                let target = timeline.gc_info.read().unwrap();
+
+                // We require that retain_lsns contains everything in `branchpoints`, but not that
+                // they are exactly equal: timeline deletions can race with us, so retain_lsns
+                // may contain some extra stuff.  It is safe to have extra timelines in there, because it
+                // just means that we retain slightly more data than we otherwise might.
+                let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
+                for b in &branchpoints {
+                    if !have_branchpoints.contains(b) {
+                        tracing::error!(
+                            "Bug: `retain_lsns` is set incorrectly.  Expected it to contain {:?}, but found {:?}",
+                            branchpoints,
+                            target.retain_lsns
+                        );
+                        debug_assert!(false);
+                        // Do not GC based on bad information!
+                        // (ab-use an existing GcError type rather than adding a new one, since this is a
+                        // "should never happen" check that will be removed soon).
+                        return Err(GcError::Remote(anyhow::anyhow!(
+                            "retain_lsns failed validation!"
+                        )));
+                    }
+                }
+            }
+        }
+
        // Ok, we now know all the branch points.
        // Update the GC information for each timeline.
        let mut gc_timelines = Vec::with_capacity(timelines.len());
@@ -3736,19 +3679,6 @@ impl Tenant {
    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
        self.tenant_conf.load().tenant_conf.clone()
    }
-
-    /// How much local storage would this tenant like to have?  It can cope with
-    /// less than this (via eviction and on-demand downloads), but this function enables
-    /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
-    /// by keeping important things on local disk.
-    pub(crate) fn local_storage_wanted(&self) -> u64 {
-        let mut wanted = 0;
-        let timelines = self.timelines.lock().unwrap();
-        for timeline in timelines.values() {
-            wanted += timeline.metrics.visible_physical_size_gauge.get();
-        }
-        wanted
-    }
 }

 /// Create the cluster temporarily in 'initdbpath' directory inside the repository
@@ -4108,7 +4038,7 @@ pub(crate) mod harness {

 #[cfg(test)]
 mod tests {
-    use std::collections::{BTreeMap, BTreeSet};
+    use std::collections::BTreeMap;

    use super::*;
    use crate::keyspace::KeySpaceAccum;
@@ -4660,10 +4590,10 @@ mod tests {

        let layer_map = tline.layers.read().await;
        let level0_deltas = layer_map
-            .layer_map()?
-            .level0_deltas()
-            .iter()
-            .map(|desc| layer_map.get_from_desc(desc))
+            .layer_map()
+            .get_level0_deltas()
+            .into_iter()
+            .map(|desc| layer_map.get_from_desc(&desc))
            .collect::<Vec<_>>();

        assert!(!level0_deltas.is_empty());
@@ -4783,7 +4713,7 @@ mod tests {
        lsn: Lsn,
        repeat: usize,
        key_count: usize,
-    ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
+    ) -> anyhow::Result<()> {
        let compact = true;
        bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
    }
@@ -4796,9 +4726,7 @@ mod tests {
        repeat: usize,
        key_count: usize,
        compact: bool,
-    ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
-        let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
-
+    ) -> anyhow::Result<()> {
        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let mut blknum = 0;

@@ -4819,7 +4747,6 @@ mod tests {
                        ctx,
                    )
                    .await?;
-                inserted.entry(test_key).or_default().insert(lsn);
                writer.finish_write(lsn);
                drop(writer);

@@ -4844,7 +4771,7 @@ mod tests {
            assert_eq!(res.layers_removed, 0, "this never removes anything");
        }

-        Ok(inserted)
+        Ok(())
    }

    //
@@ -4891,16 +4818,14 @@ mod tests {
            .await?;

        let lsn = Lsn(0x10);
-        let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
+        bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;

        let guard = tline.layers.read().await;
-        let lm = guard.layer_map()?;
-
-        lm.dump(true, &ctx).await?;
+        guard.layer_map().dump(true, &ctx).await?;

        let mut reads = Vec::new();
        let mut prev = None;
-        lm.iter_historic_layers().for_each(|desc| {
+        guard.layer_map().iter_historic_layers().for_each(|desc| {
            if !desc.is_delta() {
                prev = Some(desc.clone());
                return;
@@ -4954,39 +4879,9 @@ mod tests {
                    &ctx,
                )
                .await;
-
-            let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
-            let mut expect_missing = false;
-            let mut key = read.start().unwrap();
-            while key != read.end().unwrap() {
-                if let Some(lsns) = inserted.get(&key) {
-                    let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
-                    match expected_lsn {
-                        Some(lsn) => {
-                            expected_lsns.insert(key, *lsn);
-                        }
-                        None => {
-                            expect_missing = true;
-                            break;
-                        }
-                    }
-                } else {
-                    expect_missing = true;
-                    break;
-                }
-
-                key = key.next();
-            }
-
-            if expect_missing {
-                assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
-            } else {
-                for (key, image) in vectored_res? {
-                    let expected_lsn = expected_lsns.get(&key).expect("determined above");
-                    let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
-                    assert_eq!(image?, expected_image);
-                }
-            }
+            tline
+                .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
+                .await;
        }

        Ok(())
@@ -5036,6 +4931,10 @@ mod tests {
            )
            .await;

+        child_timeline
+            .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
+            .await;
+
        let images = vectored_res?;
        assert!(images.is_empty());
        Ok(())
@@ -5906,12 +5805,23 @@ mod tests {
            tline.freeze_and_flush().await?; // force create a delta layer
        }

-        let before_num_l0_delta_files =
-            tline.layers.read().await.layer_map()?.level0_deltas().len();
+        let before_num_l0_delta_files = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .get_level0_deltas()
+            .len();

        tline.compact(&cancel, EnumSet::empty(), &ctx).await?;

-        let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
+        let after_num_l0_delta_files = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .get_level0_deltas()
+            .len();

        assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");

@@ -6935,10 +6845,7 @@ mod tests {
        }

        let cancel = CancellationToken::new();
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        for (idx, expected) in expected_result.iter().enumerate() {
            assert_eq!(
@@ -7002,11 +6909,7 @@ mod tests {
            vec![
                // Image layer at GC horizon
                PersistentLayerKey {
-                    key_range: {
-                        let mut key = Key::MAX;
-                        key.field6 -= 1;
-                        Key::MIN..key
-                    },
+                    key_range: Key::MIN..Key::MAX,
                    lsn_range: Lsn(0x30)..Lsn(0x31),
                    is_delta: false
                },
@@ -7025,18 +6928,6 @@ mod tests {
            ]
        );

-        // increase GC horizon and compact again
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x40);
-            guard.cutoffs.space = Lsn(0x40);
-        }
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-
        Ok(())
    }

@@ -7369,10 +7260,7 @@ mod tests {
        }

        let cancel = CancellationToken::new();
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        for idx in 0..10 {
            assert_eq!(
@@ -7391,18 +7279,6 @@ mod tests {
            );
        }

-        // increase GC horizon and compact again
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x40);
-            guard.cutoffs.space = Lsn(0x40);
-        }
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-
        Ok(())
    }

@@ -7471,7 +7347,6 @@ mod tests {
                Lsn(0x60),
                &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
                3,
-                None,
            )
            .await
            .unwrap();
@@ -7596,7 +7471,7 @@ mod tests {
            ),
        ];
        let res = tline
-            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
+            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3)
            .await
            .unwrap();
        let expected_res = KeyHistoryRetention {
@@ -7642,114 +7517,6 @@ mod tests {
        };
        assert_eq!(res, expected_res);

-        // In case of branch compaction, the branch itself does not have the full history, and we need to provide
-        // the ancestor image in the test case.
-
-        let history = vec![
-            (
-                key,
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-            ),
-            (
-                key,
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
-            ),
-            (
-                key,
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
-            ),
-            (
-                key,
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            ),
-        ];
-        let res = tline
-            .generate_key_retention(
-                key,
-                &history,
-                Lsn(0x60),
-                &[],
-                3,
-                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
-            )
-            .await
-            .unwrap();
-        let expected_res = KeyHistoryRetention {
-            below_horizon: vec![(
-                Lsn(0x60),
-                KeyLogAtLsn(vec![(
-                    Lsn(0x60),
-                    Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
-                )]),
-            )],
-            above_horizon: KeyLogAtLsn(vec![(
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            )]),
-        };
-        assert_eq!(res, expected_res);
-
-        let history = vec![
-            (
-                key,
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-            ),
-            (
-                key,
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
-            ),
-            (
-                key,
-                Lsn(0x60),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
-            ),
-            (
-                key,
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            ),
-        ];
-        let res = tline
-            .generate_key_retention(
-                key,
-                &history,
-                Lsn(0x60),
-                &[Lsn(0x30)],
-                3,
-                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
-            )
-            .await
-            .unwrap();
-        let expected_res = KeyHistoryRetention {
-            below_horizon: vec![
-                (
-                    Lsn(0x30),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x20),
-                        Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-                    )]),
-                ),
-                (
-                    Lsn(0x60),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x60),
-                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
-                    )]),
-                ),
-            ],
-            above_horizon: KeyLogAtLsn(vec![(
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            )]),
-        };
-        assert_eq!(res, expected_res);
-
        Ok(())
    }

@@ -7907,10 +7674,6 @@ mod tests {
        ];

        let verify_result = || async {
-            let gc_horizon = {
-                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
-            };
            for idx in 0..10 {
                assert_eq!(
                    tline
@@ -7921,7 +7684,7 @@ mod tests {
                );
                assert_eq!(
                    tline
-                        .get(get_key(idx as u32), gc_horizon, &ctx)
+                        .get(get_key(idx as u32), Lsn(0x30), &ctx)
                        .await
                        .unwrap(),
                    &expected_result_at_gc_horizon[idx]
@@ -7946,232 +7709,7 @@ mod tests {
        verify_result().await;

        let cancel = CancellationToken::new();
-        let mut dryrun_flags = EnumSet::new();
-        dryrun_flags.insert(CompactFlags::DryRun);
-
-        tline
-            .compact_with_gc(&cancel, dryrun_flags, &ctx)
-            .await
-            .unwrap();
-        // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
-        // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
-        verify_result().await;
-
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        verify_result().await;
-
-        // compact again
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        verify_result().await;
-
-        // increase GC horizon and compact again
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x38);
-            guard.cutoffs.space = Lsn(0x38);
-        }
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
-
-        // not increasing the GC horizon and compact again
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        verify_result().await;
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
-        let (tenant, ctx) = harness.load().await;
-
-        fn get_key(id: u32) -> Key {
-            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-
-        let img_layer = (0..10)
-            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
-            .collect_vec();
-
-        let delta1 = vec![
-            (
-                get_key(1),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(2),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x28),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
-            ),
-        ];
-        let delta2 = vec![
-            (
-                get_key(5),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(6),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-        ];
-        let delta3 = vec![
-            (
-                get_key(8),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-            (
-                get_key(9),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-        ];
-
-        let parent_tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                vec![],                       // delta layers
-                vec![(Lsn(0x18), img_layer)], // image layers
-                Lsn(0x18),
-            )
-            .await?;
-
-        parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
-
-        let branch_tline = tenant
-            .branch_timeline_test_with_layers(
-                &parent_tline,
-                NEW_TIMELINE_ID,
-                Some(Lsn(0x18)),
-                &ctx,
-                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
-                ], // delta layers
-                vec![], // image layers
-                Lsn(0x50),
-            )
-            .await?;
-
-        branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
-
-        {
-            // Update GC info
-            let mut guard = parent_tline.gc_info.write().unwrap();
-            *guard = GcInfo {
-                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
-                cutoffs: GcCutoffs {
-                    time: Lsn(0x10),
-                    space: Lsn(0x10),
-                },
-                leases: Default::default(),
-                within_ancestor_pitr: false,
-            };
-        }
-
-        {
-            // Update GC info
-            let mut guard = branch_tline.gc_info.write().unwrap();
-            *guard = GcInfo {
-                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
-                cutoffs: GcCutoffs {
-                    time: Lsn(0x50),
-                    space: Lsn(0x50),
-                },
-                leases: Default::default(),
-                within_ancestor_pitr: false,
-            };
-        }
-
-        let expected_result_at_gc_horizon = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10@0x30"),
-            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10@0x48"),
-            Bytes::from_static(b"value 9@0x10@0x48"),
-        ];
-
-        let expected_result_at_lsn_40 = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10@0x30"),
-            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10"),
-            Bytes::from_static(b"value 9@0x10"),
-        ];
-
-        let verify_result = || async {
-            for idx in 0..10 {
-                assert_eq!(
-                    branch_tline
-                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_gc_horizon[idx]
-                );
-                assert_eq!(
-                    branch_tline
-                        .get(get_key(idx as u32), Lsn(0x40), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_lsn_40[idx]
-                );
-            }
-        };
-
-        verify_result().await;
-
-        let cancel = CancellationToken::new();
-        branch_tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();

        verify_result().await;

--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -29,7 +29,6 @@ impl EphemeralFile {
        conf: &PageServerConf,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
-        gate_guard: utils::sync::gate::GateGuard,
        ctx: &RequestContext,
    ) -> Result<EphemeralFile, io::Error> {
        static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -52,12 +51,10 @@ impl EphemeralFile {
        )
        .await?;

-        let prewarm = conf.l0_flush.prewarm_on_write();
-
        Ok(EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, prewarm, gate_guard),
+            rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
        })
    }

@@ -164,11 +161,7 @@ mod tests {
    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;

-        let gate = utils::sync::gate::Gate::default();
-
-        let entered = gate.enter().unwrap();
-
-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;

        let pos_foo = file.write_blob(b"foo", &ctx).await?;
        assert_eq!(
@@ -222,38 +215,4 @@ mod tests {

        Ok(())
    }
-
-    #[tokio::test]
-    async fn ephemeral_file_holds_gate_open() {
-        const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
-
-        let (conf, tenant_id, timeline_id, ctx) =
-            harness("ephemeral_file_holds_gate_open").unwrap();
-
-        let gate = utils::sync::gate::Gate::default();
-
-        let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-            .await
-            .unwrap();
-
-        let mut closing = tokio::task::spawn(async move {
-            gate.close().await;
-        });
-
-        // gate is entered until the ephemeral file is dropped
-        // do not start paused tokio-epoll-uring has a sleep loop
-        tokio::time::pause();
-        tokio::time::timeout(FOREVER, &mut closing)
-            .await
-            .expect_err("closing cannot complete before dropping");
-
-        // this is a requirement of the reset_tenant functionality: we have to be able to restart a
-        // tenant fast, and for that, we need all tenant_dir operations be guarded by entering a gate
-        drop(file);
-
-        tokio::time::timeout(FOREVER, &mut closing)
-            .await
-            .expect("closing completes right away")
-            .expect("closing does not panic");
-    }
 }
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -18,8 +18,6 @@ use super::zero_padded_read_write;
 pub struct RW {
    page_cache_file_id: page_cache::FileId,
    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
-    /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
-    _gate_guard: utils::sync::gate::GateGuard,
 }

 /// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
@@ -31,11 +29,7 @@ pub enum PrewarmOnWrite {
 }

 impl RW {
-    pub fn new(
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-        _gate_guard: utils::sync::gate::GateGuard,
-    ) -> Self {
+    pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
        let page_cache_file_id = page_cache::next_file_id();
        Self {
            page_cache_file_id,
@@ -44,7 +38,6 @@ impl RW {
                file,
                prewarm_on_write,
            )),
-            _gate_guard,
        }
    }

@@ -152,7 +145,6 @@ impl Drop for RW {
        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.

        // unlink the file
-        // we are clear to do this, because we have entered a gate
        let res = std::fs::remove_file(&self.rw.as_writer().file.path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,213 +0,0 @@
-use std::collections::HashMap;
-
-use utils::id::TimelineId;
-
-use super::remote_timeline_client::index::GcBlockingReason;
-
-type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
-
-#[derive(Default)]
-pub(crate) struct GcBlock {
-    /// The timelines which have current reasons to block gc.
-    ///
-    /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
-    /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
-    reasons: std::sync::Mutex<Storage>,
-    blocking: tokio::sync::Mutex<()>,
-}
-
-impl GcBlock {
-    /// Start another gc iteration.
-    ///
-    /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
-    /// it's ending, or if not currently possible, a value describing the reasons why not.
-    ///
-    /// Cancellation safe.
-    pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
-        let reasons = {
-            let g = self.reasons.lock().unwrap();
-
-            // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
-            // tests, we use everything. we should warn if the gc has been consecutively blocked
-            // for more than 1h (within single tenant session?).
-            BlockingReasons::clean_and_summarize(g)
-        };
-
-        if let Some(reasons) = reasons {
-            Err(reasons)
-        } else {
-            Ok(Guard {
-                _inner: self.blocking.lock().await,
-            })
-        }
-    }
-
-    pub(crate) fn summary(&self) -> Option<BlockingReasons> {
-        let g = self.reasons.lock().unwrap();
-
-        BlockingReasons::summarize(&g)
-    }
-
-    /// Start blocking gc for this one timeline for the given reason.
-    ///
-    /// This is not a guard based API but instead it mimics set API. The returned future will not
-    /// resolve until an existing gc round has completed.
-    ///
-    /// Returns true if this block was new, false if gc was already blocked for this reason.
-    ///
-    /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will
-    /// keep the gc blocking reason.
-    pub(crate) async fn insert(
-        &self,
-        timeline: &super::Timeline,
-        reason: GcBlockingReason,
-    ) -> anyhow::Result<bool> {
-        let (added, uploaded) = {
-            let mut g = self.reasons.lock().unwrap();
-            let set = g.entry(timeline.timeline_id).or_default();
-            let added = set.insert(reason);
-
-            // LOCK ORDER: intentionally hold the lock, see self.reasons.
-            let uploaded = timeline
-                .remote_client
-                .schedule_insert_gc_block_reason(reason)?;
-
-            (added, uploaded)
-        };
-
-        uploaded.await?;
-
-        // ensure that any ongoing gc iteration has completed
-        drop(self.blocking.lock().await);
-
-        Ok(added)
-    }
-
-    /// Remove blocking gc for this one timeline and the given reason.
-    pub(crate) async fn remove(
-        &self,
-        timeline: &super::Timeline,
-        reason: GcBlockingReason,
-    ) -> anyhow::Result<()> {
-        use std::collections::hash_map::Entry;
-
-        super::span::debug_assert_current_span_has_tenant_and_timeline_id();
-
-        let (remaining_blocks, uploaded) = {
-            let mut g = self.reasons.lock().unwrap();
-            match g.entry(timeline.timeline_id) {
-                Entry::Occupied(mut oe) => {
-                    let set = oe.get_mut();
-                    set.remove(reason);
-                    if set.is_empty() {
-                        oe.remove();
-                    }
-                }
-                Entry::Vacant(_) => {
-                    // we must still do the index_part.json update regardless, in case we had earlier
-                    // been cancelled
-                }
-            }
-
-            let remaining_blocks = g.len();
-
-            // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
-            let uploaded = timeline
-                .remote_client
-                .schedule_remove_gc_block_reason(reason)?;
-
-            (remaining_blocks, uploaded)
-        };
-        uploaded.await?;
-
-        // no need to synchronize with gc iteration again
-
-        if remaining_blocks > 0 {
-            tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
-        } else {
-            tracing::info!("gc is now unblocked for the tenant");
-        }
-
-        Ok(())
-    }
-
-    pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
-        let unblocked = {
-            let mut g = self.reasons.lock().unwrap();
-            if g.is_empty() {
-                return;
-            }
-
-            g.remove(&timeline.timeline_id);
-
-            BlockingReasons::clean_and_summarize(g).is_none()
-        };
-
-        if unblocked {
-            tracing::info!("gc is now unblocked following deletion");
-        }
-    }
-
-    /// Initialize with the non-deleted timelines of this tenant.
-    pub(crate) fn set_scanned(&self, scanned: Storage) {
-        let mut g = self.reasons.lock().unwrap();
-        assert!(g.is_empty());
-        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
-
-        if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
-            tracing::info!(summary=?reasons, "initialized with gc blocked");
-        }
-    }
-}
-
-pub(super) struct Guard<'a> {
-    _inner: tokio::sync::MutexGuard<'a, ()>,
-}
-
-#[derive(Debug)]
-pub(crate) struct BlockingReasons {
-    timelines: usize,
-    reasons: enumset::EnumSet<GcBlockingReason>,
-}
-
-impl std::fmt::Display for BlockingReasons {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{} timelines block for {:?}",
-            self.timelines, self.reasons
-        )
-    }
-}
-
-impl BlockingReasons {
-    fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        let mut reasons = enumset::EnumSet::empty();
-        g.retain(|_key, value| {
-            reasons = reasons.union(*value);
-            !value.is_empty()
-        });
-        if !g.is_empty() {
-            Some(BlockingReasons {
-                timelines: g.len(),
-                reasons,
-            })
-        } else {
-            None
-        }
-    }
-
-    fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        if g.is_empty() {
-            None
-        } else {
-            let reasons = g
-                .values()
-                .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
-            Some(BlockingReasons {
-                timelines: g.len(),
-                reasons,
-            })
-        }
-    }
-}
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,8 +51,7 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
-use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
+use pageserver_api::keyspace::KeySpaceAccum;
 use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
@@ -62,7 +61,7 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;

-use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
+use super::storage_layer::PersistentLayerDesc;

 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -846,8 +845,8 @@ impl LayerMap {
    }

    /// Return all L0 delta layers
-    pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
-        &self.l0_delta_layers
+    pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
+        self.l0_delta_layers.to_vec()
    }

    /// debugging function to print out the contents of the layer map
@@ -872,183 +871,11 @@ impl LayerMap {
        println!("End dump LayerMap");
        Ok(())
    }
-
-    /// `read_points` represent the tip of a timeline and any branch points, i.e. the places
-    /// where we expect to serve reads.
-    ///
-    /// This function is O(N) and should be called infrequently.  The caller is responsible for
-    /// looking up and updating the Layer objects for these layer descriptors.
-    pub fn get_visibility(
-        &self,
-        mut read_points: Vec<Lsn>,
-    ) -> (
-        Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
-        KeySpace,
-    ) {
-        // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
-        // KeySpace is intended to be composed statically and iterated over.
-        struct KeyShadow {
-            // Map of range start to range end
-            inner: RangeSetBlaze<i128>,
-        }
-
-        impl KeyShadow {
-            fn new() -> Self {
-                Self {
-                    inner: Default::default(),
-                }
-            }
-
-            fn contains(&self, range: Range<Key>) -> bool {
-                let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
-                self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
-                    CheckSortedDisjoint::from([range_incl]),
-                ))
-            }
-
-            /// Add the input range to the keys covered by self.
-            ///
-            /// Return true if inserting this range covered some keys that were previously not covered
-            fn cover(&mut self, insert: Range<Key>) -> bool {
-                let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
-                self.inner.ranges_insert(range_incl)
-            }
-
-            fn reset(&mut self) {
-                self.inner = Default::default();
-            }
-
-            fn to_keyspace(&self) -> KeySpace {
-                let mut accum = KeySpaceAccum::new();
-                for range_incl in self.inner.ranges() {
-                    let range = Range {
-                        start: Key::from_i128(*range_incl.start()),
-                        end: Key::from_i128(range_incl.end() + 1),
-                    };
-                    accum.add_range(range)
-                }
-
-                accum.to_keyspace()
-            }
-        }
-
-        // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
-        // and a ReadPoint
-        read_points.sort_by_key(|rp| rp.0);
-        let mut shadow = KeyShadow::new();
-
-        // We will interleave all our read points and layers into a sorted collection
-        enum Item {
-            ReadPoint { lsn: Lsn },
-            Layer(Arc<PersistentLayerDesc>),
-        }
-
-        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
-        items.extend(self.iter_historic_layers().map(Item::Layer));
-        items.extend(
-            read_points
-                .into_iter()
-                .map(|rp| Item::ReadPoint { lsn: rp }),
-        );
-
-        // Ordering: we want to iterate like this:
-        // 1. Highest LSNs first
-        // 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
-        // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
-        items.sort_by_key(|item| {
-            std::cmp::Reverse(match item {
-                Item::Layer(layer) => {
-                    if layer.is_delta() {
-                        (Lsn(layer.get_lsn_range().end.0 - 1), 0)
-                    } else {
-                        (layer.image_layer_lsn(), 1)
-                    }
-                }
-                Item::ReadPoint { lsn } => (*lsn, 2),
-            })
-        });
-
-        let mut results = Vec::with_capacity(self.historic.len());
-
-        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
-
-        for item in items {
-            let (reached_lsn, is_readpoint) = match &item {
-                Item::ReadPoint { lsn } => (lsn, true),
-                Item::Layer(layer) => (&layer.lsn_range.start, false),
-            };
-            maybe_covered_deltas.retain(|d| {
-                if *reached_lsn >= d.lsn_range.start && is_readpoint {
-                    // We encountered a readpoint within the delta layer: it is visible
-
-                    results.push((d.clone(), LayerVisibilityHint::Visible));
-                    false
-                } else if *reached_lsn < d.lsn_range.start {
-                    // We passed the layer's range without encountering a read point: it is not visible
-                    results.push((d.clone(), LayerVisibilityHint::Covered));
-                    false
-                } else {
-                    // We're still in the delta layer: continue iterating
-                    true
-                }
-            });
-
-            match item {
-                Item::ReadPoint { lsn: _lsn } => {
-                    // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
-                    // to assume that the whole key range is visible at the branch point.
-                    shadow.reset();
-                }
-                Item::Layer(layer) => {
-                    let visibility = if layer.is_delta() {
-                        if shadow.contains(layer.get_key_range()) {
-                            // If a layer isn't visible based on current state, we must defer deciding whether
-                            // it is truly not visible until we have advanced past the delta's range: we might
-                            // encounter another branch point within this delta layer's LSN range.
-                            maybe_covered_deltas.push(layer);
-                            continue;
-                        } else {
-                            LayerVisibilityHint::Visible
-                        }
-                    } else {
-                        let modified = shadow.cover(layer.get_key_range());
-                        if modified {
-                            // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
-                            LayerVisibilityHint::Visible
-                        } else {
-                            // An image layer in a region that was already covered
-                            LayerVisibilityHint::Covered
-                        }
-                    };
-
-                    results.push((layer, visibility));
-                }
-            }
-        }
-
-        // Drain any remaining maybe_covered deltas
-        results.extend(
-            maybe_covered_deltas
-                .into_iter()
-                .map(|d| (d, LayerVisibilityHint::Covered)),
-        );
-
-        (results, shadow.to_keyspace())
-    }
 }

 #[cfg(test)]
 mod tests {
-    use crate::tenant::{storage_layer::LayerName, IndexPart};
-    use pageserver_api::{
-        key::DBDIR_KEY,
-        keyspace::{KeySpace, KeySpaceRandomAccum},
-    };
-    use std::{collections::HashMap, path::PathBuf};
-    use utils::{
-        id::{TenantId, TimelineId},
-        shard::TenantShardId,
-    };
+    use pageserver_api::keyspace::KeySpace;

    use super::*;

@@ -1175,299 +1002,4 @@ mod tests {
            }
        }
    }
-
-    #[test]
-    fn layer_visibility_basic() {
-        // A simple synthetic input, as a smoke test.
-        let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
-        let timeline_id = TimelineId::generate();
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-
-        const FAKE_LAYER_SIZE: u64 = 1024;
-
-        let inject_delta = |updates: &mut BatchedUpdates,
-                            key_start: i128,
-                            key_end: i128,
-                            lsn_start: u64,
-                            lsn_end: u64| {
-            let desc = PersistentLayerDesc::new_delta(
-                tenant_shard_id,
-                timeline_id,
-                Range {
-                    start: Key::from_i128(key_start),
-                    end: Key::from_i128(key_end),
-                },
-                Range {
-                    start: Lsn(lsn_start),
-                    end: Lsn(lsn_end),
-                },
-                1024,
-            );
-            updates.insert_historic(desc.clone());
-            desc
-        };
-
-        let inject_image =
-            |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
-                let desc = PersistentLayerDesc::new_img(
-                    tenant_shard_id,
-                    timeline_id,
-                    Range {
-                        start: Key::from_i128(key_start),
-                        end: Key::from_i128(key_end),
-                    },
-                    Lsn(lsn),
-                    FAKE_LAYER_SIZE,
-                );
-                updates.insert_historic(desc.clone());
-                desc
-            };
-
-        //
-        // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
-        // we expect to handle.  You can follow these examples through in the same order as they would be processed
-        // by the function under test.
-        //
-
-        let mut read_points = vec![Lsn(1000)];
-
-        // A delta ahead of any image layer
-        let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
-
-        // An image layer is visible and covers some layers beneath itself
-        let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
-
-        // A delta layer covered by the image layer: should be covered
-        let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
-
-        // A delta layer partially covered by an image layer: should be visible
-        let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
-
-        // A delta layer not covered by an image layer: should be visible
-        let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
-
-        // An image layer covered by the image layer above: should be covered
-        let covered_image = inject_image(&mut updates, 10, 20, 89);
-
-        // An image layer partially covered by an image layer: should be visible
-        let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
-
-        // An image layer not covered by an image layer: should be visible
-        let not_covered_image = inject_image(&mut updates, 1, 4, 89);
-
-        // A read point: this will make subsequent layers below here visible, even if there are
-        // more recent layers covering them.
-        read_points.push(Lsn(80));
-
-        // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
-        let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
-
-        // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
-        // the read point should make it visible, even though its end LSN is covered
-        let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
-        let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
-        read_points.push(Lsn(65));
-        let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
-
-        let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
-
-        updates.flush();
-
-        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
-        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
-
-        assert_eq!(
-            layer_visibilities.get(&ahead_layer),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&visible_covering_img),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&partially_covered_delta),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&not_covered_delta),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_image),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&partially_covered_image),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&not_covered_image),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_below_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covering_img_between_read_points),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_between_read_points),
-            Some(&LayerVisibilityHint::Covered)
-        );
-        assert_eq!(
-            layer_visibilities.get(&covered_delta_intersects_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-        assert_eq!(
-            layer_visibilities.get(&visible_img_after_last_read_point),
-            Some(&LayerVisibilityHint::Visible)
-        );
-
-        // Shadow should include all the images below the last read point
-        let expected_shadow = KeySpace {
-            ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
-        };
-        assert_eq!(shadow, expected_shadow);
-    }
-
-    fn fixture_path(relative: &str) -> PathBuf {
-        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
-    }
-
-    #[test]
-    fn layer_visibility_realistic() {
-        // Load a large example layermap
-        let index_raw = std::fs::read_to_string(fixture_path(
-            "test_data/indices/mixed_workload/index_part.json",
-        ))
-        .unwrap();
-        let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
-
-        let tenant_id = TenantId::generate();
-        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-        let timeline_id = TimelineId::generate();
-
-        let mut layer_map = LayerMap::default();
-        let mut updates = layer_map.batch_update();
-        for (layer_name, layer_metadata) in index.layer_metadata {
-            let layer_desc = match layer_name {
-                LayerName::Image(layer_name) => PersistentLayerDesc {
-                    key_range: layer_name.key_range.clone(),
-                    lsn_range: layer_name.lsn_as_range(),
-                    tenant_shard_id,
-                    timeline_id,
-                    is_delta: false,
-                    file_size: layer_metadata.file_size,
-                },
-                LayerName::Delta(layer_name) => PersistentLayerDesc {
-                    key_range: layer_name.key_range,
-                    lsn_range: layer_name.lsn_range,
-                    tenant_shard_id,
-                    timeline_id,
-                    is_delta: true,
-                    file_size: layer_metadata.file_size,
-                },
-            };
-            updates.insert_historic(layer_desc);
-        }
-        updates.flush();
-
-        let read_points = vec![index.metadata.disk_consistent_lsn()];
-        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
-        for (layer_desc, visibility) in &layer_visibilities {
-            tracing::info!("{layer_desc:?}: {visibility:?}");
-            eprintln!("{layer_desc:?}: {visibility:?}");
-        }
-
-        // The shadow should be non-empty, since there were some image layers
-        assert!(!shadow.ranges.is_empty());
-
-        // At least some layers should be marked covered
-        assert!(layer_visibilities
-            .iter()
-            .any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
-
-        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
-
-        // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
-        for (layer_desc, visible) in &layer_visibilities {
-            let mut coverage = KeySpaceRandomAccum::new();
-            let mut covered_by = Vec::new();
-
-            for other_layer in layer_map.iter_historic_layers() {
-                if &other_layer == layer_desc {
-                    continue;
-                }
-                if !other_layer.is_delta()
-                    && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
-                    && other_layer.key_range.start <= layer_desc.key_range.end
-                    && layer_desc.key_range.start <= other_layer.key_range.end
-                {
-                    coverage.add_range(other_layer.get_key_range());
-                    covered_by.push((*other_layer).clone());
-                }
-            }
-            let coverage = coverage.to_keyspace();
-
-            let expect_visible = if coverage.ranges.len() == 1
-                && coverage.contains(&layer_desc.key_range.start)
-                && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
-            {
-                LayerVisibilityHint::Covered
-            } else {
-                LayerVisibilityHint::Visible
-            };
-
-            if expect_visible != *visible {
-                eprintln!(
-                    "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
-                    layer_desc.key_range.start,
-                    layer_desc.key_range.end,
-                    layer_desc.lsn_range.start,
-                    layer_desc.lsn_range.end,
-                    layer_desc.is_delta()
-                );
-                if expect_visible == LayerVisibilityHint::Covered {
-                    eprintln!("Covered by:");
-                    for other in covered_by {
-                        eprintln!(
-                            "  {}..{} @ {}",
-                            other.get_key_range().start,
-                            other.get_key_range().end,
-                            other.image_layer_lsn()
-                        );
-                    }
-                    if let Some(range) = coverage.ranges.first() {
-                        eprintln!(
-                            "Total coverage from contributing layers: {}..{}",
-                            range.start, range.end
-                        );
-                    } else {
-                        eprintln!(
-                            "Total coverage from contributing layers: {:?}",
-                            coverage.ranges
-                        );
-                    }
-                }
-            }
-            assert_eq!(expect_visible, *visible);
-        }
-
-        // Sanity: the layer that holds latest data for the DBDIR key should always be visible
-        // (just using this key as a key that will always exist for any layermap fixture)
-        let dbdir_layer = layer_map
-            .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
-            .unwrap();
-        assert!(matches!(
-            layer_visibilities.get(&dbdir_layer.layer).unwrap(),
-            LayerVisibilityHint::Visible
-        ));
-    }
 }
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -521,10 +521,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {

        Ok(&self.historic_coverage)
    }
-
-    pub(crate) fn len(&self) -> usize {
-        self.layers.len()
-    }
 }

 #[test]
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -285,15 +285,12 @@ impl TimelineMetadata {
    }

    /// When reparenting, the `ancestor_lsn` does not change.
-    ///
-    /// Returns true if anything was changed.
    pub fn reparent(&mut self, timeline: &TimelineId) {
        assert!(self.body.ancestor_timeline.is_some());
        // no assertion for redoing this: it's fine, we may have to repeat this multiple times over
        self.body.ancestor_timeline = Some(*timeline);
    }

-    /// Returns true if anything was changed
    pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
        if let Some(ancestor) = self.body.ancestor_timeline {
            assert_eq!(ancestor, branchpoint.0);
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
-use std::collections::{BTreeMap, HashMap, HashSet};
+use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
 use std::time::Duration;
@@ -54,8 +54,8 @@ use utils::id::{TenantId, TimelineId};

 use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
-use super::timeline::detach_ancestor::{self, PreparedTimelineDetach};
-use super::{GlobalShutDown, TenantSharedResources};
+use super::timeline::detach_ancestor::PreparedTimelineDetach;
+use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is elegible to service
@@ -116,6 +116,8 @@ pub(crate) enum ShardSelector {
    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
    /// ignore it.
    Zero,
+    /// Pick the first shard we find for the TenantId
+    First,
    /// Pick the shard that holds this key
    Page(Key),
    /// The shard ID is known: pick the given shard
@@ -224,8 +226,21 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 }

 /// See [`Self::spawn`].
-#[derive(Clone, Default)]
-pub struct BackgroundPurges(tokio_util::task::TaskTracker);
+#[derive(Clone)]
+pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
+enum BackgroundPurgesInner {
+    Open(tokio::task::JoinSet<()>),
+    // async mutex: concurrent shutdown() callers wait on it and coalesce onto the same JoinSet
+    ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
+}
+
+impl Default for BackgroundPurges {
+    fn default() -> Self {
+        Self(Arc::new(std::sync::Mutex::new(
+            BackgroundPurgesInner::Open(JoinSet::new()),
+        )))
+    }
+}

 impl BackgroundPurges {
    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
@@ -234,32 +249,24 @@ impl BackgroundPurges {
    /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
    /// Thus the [`BackgroundPurges`] type to keep track of these tasks.
    pub fn spawn(&self, tmp_path: Utf8PathBuf) {
-        // because on shutdown we close and wait, we are misusing TaskTracker a bit.
-        //
-        // so first acquire a token, then check if the tracker has been closed. the tracker might get closed
-        // right after, but at least the shutdown will wait for what we are spawning next.
-        let token = self.0.token();
-
-        if self.0.is_closed() {
-            warn!(
-                %tmp_path,
-                "trying to spawn background purge during shutdown, ignoring"
-            );
-            return;
-        }
-
-        let span = info_span!(parent: None, "background_purge", %tmp_path);
-
-        let task = move || {
-            let _token = token;
-            let _entered = span.entered();
-            if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) {
-                // should we fatal_io_error here?
-                warn!(%error, "failed to purge tenant directory");
+        let mut guard = self.0.lock().unwrap();
+        let jset = match &mut *guard {
+            BackgroundPurgesInner::Open(ref mut jset) => jset,
+            BackgroundPurgesInner::ShuttingDown(_) => {
+                warn!(%tmp_path, "trying to spawn background purge during shutdown, ignoring");
+                return; // nothing is spawned, so tmp_path is not purged here
            }
        };
-
-        BACKGROUND_RUNTIME.spawn_blocking(task);
+        jset.spawn_on(
+            async move {
+                if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
+                    // should we fatal_io_error here?
+                    warn!(%error, path=%tmp_path, "failed to purge tenant directory");
+                }
+            }
+            .instrument(info_span!(parent: None, "background_purge")),
+            BACKGROUND_RUNTIME.handle(),
+        );
    }

    /// When this future completes, all background purges have completed.
@@ -273,9 +280,42 @@ impl BackgroundPurges {
    /// instances of this future will continue to be correct.
    #[instrument(skip_all)]
    pub async fn shutdown(&self) {
-        // forbid new tasks (can be called many times)
-        self.0.close();
-        self.0.wait().await;
+        let jset = {
+            let mut guard = self.0.lock().unwrap();
+            match &mut *guard {
+                BackgroundPurgesInner::Open(jset) => {
+                    *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
+                        std::mem::take(jset),
+                    )))
+                }
+                BackgroundPurgesInner::ShuttingDown(_) => {
+                    // calling shutdown multiple times is most likely a bug in pageserver shutdown code
+                    warn!("already shutting down");
+                }
+            };
+            match &mut *guard {
+                BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
+                BackgroundPurgesInner::Open(_) => {
+                    unreachable!("above code transitions into shut down state");
+                }
+            }
+        };
+        let mut jset = jset.lock().await; // concurrent callers coalesce here
+        while let Some(res) = jset.join_next().await {
+            match res {
+                Ok(()) => {}
+                Err(e) if e.is_panic() => {
+                    // If it panicked, the error is already logged by the panic hook.
+                }
+                Err(e) if e.is_cancelled() => {
+                    unreachable!("we don't cancel the joinset or runtime")
+                }
+                Err(e) => {
+                    // No idea when this can happen, but let's log it.
+                    warn!(%e, "background purge task failed or panicked");
+                }
+            }
+        }
    }
 }

@@ -627,20 +667,17 @@ pub async fn init_tenant_mgr(
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
-            LocationMode::Attached(attached_conf) => TenantSlot::Attached(
-                tenant_spawn(
-                    conf,
-                    tenant_shard_id,
-                    &tenant_dir_path,
-                    resources.clone(),
-                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-                    shard_identity,
-                    Some(init_order.clone()),
-                    SpawnMode::Lazy,
-                    &ctx,
-                )
-                .expect("global shutdown during init_tenant_mgr cannot happen"),
-            ),
+            LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
+                conf,
+                tenant_shard_id,
+                &tenant_dir_path,
+                resources.clone(),
+                AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                shard_identity,
+                Some(init_order.clone()),
+                SpawnMode::Lazy,
+                &ctx,
+            )),
            LocationMode::Secondary(secondary_conf) => {
                info!(
                    tenant_id = %tenant_shard_id.tenant_id,
@@ -688,7 +725,7 @@ fn tenant_spawn(
    init_order: Option<InitializationOrder>,
    mode: SpawnMode,
    ctx: &RequestContext,
-) -> Result<Arc<Tenant>, GlobalShutDown> {
+) -> Arc<Tenant> {
    // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
    // path, and contains a configuration file.  Assertions that do synchronous I/O are limited to debug mode
    // to avoid impacting prod runtime performance.
@@ -1155,10 +1192,7 @@ impl TenantManager {
                    None,
                    spawn_mode,
                    ctx,
-                )
-                .map_err(|_: GlobalShutDown| {
-                    UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
-                })?;
+                );

                TenantSlot::Attached(tenant)
            }
@@ -1279,7 +1313,7 @@ impl TenantManager {
            None,
            SpawnMode::Eager,
            ctx,
-        )?;
+        );

        slot_guard.upsert(TenantSlot::Attached(tenant))?;

@@ -1729,9 +1763,14 @@ impl TenantManager {
            let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
            for timeline in timelines.values() {
                tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
-                let layers = timeline.layers.read().await;
+                let timeline_layers = timeline
+                    .layers
+                    .read()
+                    .await
+                    .likely_resident_layers()
+                    .collect::<Vec<_>>();

-                for layer in layers.likely_resident_layers() {
+                for layer in timeline_layers {
                    let relative_path = layer
                        .local_path()
                        .strip_prefix(&parent_path)
@@ -1927,11 +1966,8 @@ impl TenantManager {
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        prepared: PreparedTimelineDetach,
-        mut attempt: detach_ancestor::Attempt,
        ctx: &RequestContext,
-    ) -> Result<HashSet<TimelineId>, anyhow::Error> {
-        use crate::tenant::timeline::detach_ancestor::Error;
-        // FIXME: this is unnecessary, slotguard already has these semantics
+    ) -> Result<Vec<TimelineId>, anyhow::Error> {
        struct RevertOnDropSlot(Option<SlotGuard>);

        impl Drop for RevertOnDropSlot {
@@ -1979,98 +2015,43 @@ impl TenantManager {

        let timeline = tenant.get_timeline(timeline_id, true)?;

-        let resp = timeline
-            .detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
+        let reparented = timeline
+            .complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
            .await?;

        let mut slot_guard = slot_guard.into_inner();

-        let tenant = if resp.reset_tenant_required() {
-            attempt.before_reset_tenant();
-
-            let (_guard, progress) = utils::completion::channel();
-            match tenant.shutdown(progress, ShutdownMode::Hard).await {
-                Ok(()) => {
-                    slot_guard.drop_old_value()?;
-                }
-                Err(_barrier) => {
-                    slot_guard.revert();
-                    // this really should not happen, at all, unless shutdown was already going?
-                    anyhow::bail!("Cannot restart Tenant, already shutting down");
-                }
+        let (_guard, progress) = utils::completion::channel();
+        match tenant.shutdown(progress, ShutdownMode::Hard).await {
+            Ok(()) => {
+                slot_guard.drop_old_value()?;
            }
-
-            let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-            let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
-
-            let shard_identity = config.shard;
-            let tenant = tenant_spawn(
-                self.conf,
-                tenant_shard_id,
-                &tenant_path,
-                self.resources.clone(),
-                AttachedTenantConf::try_from(config)?,
-                shard_identity,
-                None,
-                SpawnMode::Eager,
-                ctx,
-            )?;
-
-            {
-                let mut g = tenant.ongoing_timeline_detach.lock().unwrap();
-                assert!(
-                    g.is_none(),
-                    "there cannot be any new timeline detach ancestor on newly created tenant"
-                );
-                *g = Some((attempt.timeline_id, attempt.new_barrier()));
+            Err(_barrier) => {
+                slot_guard.revert();
+                // this really should not happen, at all, unless shutdown was already going?
+                anyhow::bail!("Cannot restart Tenant, already shutting down");
            }
-
-            slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
-            tenant
-        } else {
-            tracing::info!("skipping tenant_reset as no changes made required it");
-            tenant
-        };
-
-        if let Some(reparented) = resp.completed() {
-            // finally ask the restarted tenant to complete the detach
-            //
-            // rationale for 9999s: we don't really have a timetable here; if retried, the caller
-            // will get an 503.
-            tenant
-                .wait_to_become_active(std::time::Duration::from_secs(9999))
-                .await
-                .map_err(|e| {
-                    use pageserver_api::models::TenantState;
-                    use GetActiveTenantError::{Cancelled, WillNotBecomeActive};
-                    match e {
-                        Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => {
-                            Error::ShuttingDown
-                        }
-                        other => Error::Unexpected(other.into()),
-                    }
-                })?;
-
-            utils::pausable_failpoint!(
-                "timeline-detach-ancestor::after_activating_before_finding-pausable"
-            );
-
-            let timeline = tenant
-                .get_timeline(attempt.timeline_id, true)
-                .map_err(|_| Error::DetachedNotFoundAfterRestart)?;
-
-            timeline
-                .complete_detaching_timeline_ancestor(&tenant, attempt, ctx)
-                .await
-                .map(|()| reparented)
-                .map_err(|e| e.into())
-        } else {
-            // at least the latest versions have now been downloaded and refreshed; be ready to
-            // retry another time.
-            Err(anyhow::anyhow!(
-                "failed to reparent all candidate timelines, please retry"
-            ))
        }
+
+        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+
+        let shard_identity = config.shard;
+        let tenant = tenant_spawn(
+            self.conf,
+            tenant_shard_id,
+            &tenant_path,
+            self.resources.clone(),
+            AttachedTenantConf::try_from(config)?,
+            shard_identity,
+            None,
+            SpawnMode::Eager,
+            ctx,
+        );
+
+        slot_guard.upsert(TenantSlot::Attached(tenant))?;
+
+        Ok(reparented)
    }

    /// A page service client sends a TenantId, and to look up the correct Tenant we must
@@ -2107,6 +2088,7 @@ impl TenantManager {
                    };

                    match selector {
+                        ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                            return ShardResolveResult::Found(tenant.clone())
                        }
@@ -2142,57 +2124,6 @@ impl TenantManager {
            }
        }
    }
-
-    /// Calculate the tenant shards' contributions to this pageserver's utilization metrics.  The
-    /// returned values are:
-    ///  - the number of bytes of local disk space this pageserver's shards are requesting, i.e.
-    ///    how much space they would use if not impacted by disk usage eviction.
-    ///  - the number of tenant shards currently on this pageserver, including attached
-    ///    and secondary.
-    ///
-    /// This function is quite expensive: callers are expected to cache the result and
-    /// limit how often they call it.
-    pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> {
-        let tenants = self.tenants.read().unwrap();
-        let m = match &*tenants {
-            TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
-        };
-        let shard_count = m.len();
-        let mut wanted_bytes = 0;
-
-        for tenant_slot in m.values() {
-            match tenant_slot {
-                TenantSlot::InProgress(_barrier) => {
-                    // While a slot is being changed, we can't know how much storage it wants.  This
-                    // means this function's output can fluctuate if a lot of changes are going on
-                    // (such as transitions from secondary to attached).
-                    //
-                    // We could wait for the barrier and retry, but it's important that the utilization
-                    // API is responsive, and the data quality impact is not very significant.
-                    continue;
-                }
-                TenantSlot::Attached(tenant) => {
-                    wanted_bytes += tenant.local_storage_wanted();
-                }
-                TenantSlot::Secondary(secondary) => {
-                    let progress = secondary.progress.lock().unwrap();
-                    wanted_bytes += if progress.heatmap_mtime.is_some() {
-                        // If we have heatmap info, then we will 'want' the sum
-                        // of the size of layers in the heatmap: this is how much space
-                        // we would use if not doing any eviction.
-                        progress.bytes_total
-                    } else {
-                        // In the absence of heatmap info, assume that the secondary location simply
-                        // needs as much space as it is currently using.
-                        secondary.resident_size_metric.get()
-                    }
-                }
-            }
-        }
-
-        Ok((wanted_bytes, shard_count as u32))
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -2239,9 +2170,6 @@ pub(crate) enum GetActiveTenantError {
    /// never happen.
    #[error("Tenant is broken: {0}")]
    Broken(String),
-
-    #[error("reconnect to switch tenant id")]
-    SwitchedTenant,
 }

 #[derive(Debug, thiserror::Error)]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -736,13 +736,12 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Reparent this timeline to a new parent.
-    ///
-    /// A retryable step of timeline ancestor detach.
    pub(crate) async fn schedule_reparenting_and_wait(
        self: &Arc<Self>,
        new_parent: &TimelineId,
    ) -> anyhow::Result<()> {
+        // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
+        // and reads the in-memory part we cannot do the detaching like this
        let receiver = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
@@ -753,25 +752,17 @@ impl RemoteTimelineClient {
                ));
            };

-            let uploaded = &upload_queue.clean.0.metadata;
+            upload_queue.dirty.metadata.reparent(new_parent);
+            upload_queue.dirty.lineage.record_previous_ancestor(&prev);

-            if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() {
-                // nothing to do
-                None
-            } else {
-                upload_queue.dirty.metadata.reparent(new_parent);
-                upload_queue.dirty.lineage.record_previous_ancestor(&prev);
+            self.schedule_index_upload(upload_queue)?;

-                self.schedule_index_upload(upload_queue)?;
-
-                Some(self.schedule_barrier0(upload_queue))
-            }
+            self.schedule_barrier0(upload_queue)
        };

-        if let Some(receiver) = receiver {
-            Self::wait_completion0(receiver).await?;
-        }
-        Ok(())
+        Self::wait_completion0(receiver)
+            .await
+            .context("wait completion")
    }

    /// Schedules uploading a new version of `index_part.json` with the given layers added,
@@ -787,142 +778,26 @@ impl RemoteTimelineClient {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) {
-                None
-            } else {
-                upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
-                upload_queue.dirty.lineage.record_detaching(&adopted);
+            upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
+            upload_queue.dirty.lineage.record_detaching(&adopted);

-                for layer in layers {
-                    let prev = upload_queue
-                        .dirty
-                        .layer_metadata
-                        .insert(layer.layer_desc().layer_name(), layer.metadata());
-                    assert!(prev.is_none(), "copied layer existed already {layer}");
-                }
-
-                self.schedule_index_upload(upload_queue)?;
-
-                Some(self.schedule_barrier0(upload_queue))
+            for layer in layers {
+                upload_queue
+                    .dirty
+                    .layer_metadata
+                    .insert(layer.layer_desc().layer_name(), layer.metadata());
            }
+
+            self.schedule_index_upload(upload_queue)?;
+
+            let barrier = self.schedule_barrier0(upload_queue);
+            self.launch_queued_tasks(upload_queue);
+            barrier
        };

-        if let Some(barrier) = barrier {
-            Self::wait_completion0(barrier).await?;
-        }
-        Ok(())
-    }
-
-    /// Adds a gc blocking reason for this timeline if one does not exist already.
-    ///
-    /// A retryable step of timeline detach ancestor.
-    ///
-    /// Returns a future which waits until the completion of the upload.
-    pub(crate) fn schedule_insert_gc_block_reason(
-        self: &Arc<Self>,
-        reason: index::GcBlockingReason,
-    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
-    {
-        let maybe_barrier = {
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard.initialized_mut()?;
-
-            if let index::GcBlockingReason::DetachAncestor = reason {
-                if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
-                    drop(guard);
-                    panic!("cannot start detach ancestor if there is nothing to detach from");
-                }
-            }
-
-            let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
-
-            let current = upload_queue.dirty.gc_blocking.as_ref();
-            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
-
-            match (current, uploaded) {
-                (x, y) if wanted(x) && wanted(y) => None,
-                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
-                // Usual case: !wanted(x) && !wanted(y)
-                //
-                // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
-                // turn on and off some reason.
-                (x, y) => {
-                    if !wanted(x) && wanted(y) {
-                        // this could be avoided by having external in-memory synchronization, like
-                        // timeline detach ancestor
-                        warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
-                    }
-
-                    // at this point, the metadata must always show that there is a parent
-                    upload_queue.dirty.gc_blocking = current
-                        .map(|x| x.with_reason(reason))
-                        .or_else(|| Some(index::GcBlocking::started_now_for(reason)));
-                    self.schedule_index_upload(upload_queue)?;
-                    Some(self.schedule_barrier0(upload_queue))
-                }
-            }
-        };
-
-        Ok(async move {
-            if let Some(barrier) = maybe_barrier {
-                Self::wait_completion0(barrier).await?;
-            }
-            Ok(())
-        })
-    }
-
-    /// Removes a gc blocking reason for this timeline if one exists.
-    ///
-    /// A retryable step of timeline detach ancestor.
-    ///
-    /// Returns a future which waits until the completion of the upload.
-    pub(crate) fn schedule_remove_gc_block_reason(
-        self: &Arc<Self>,
-        reason: index::GcBlockingReason,
-    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
-    {
-        let maybe_barrier = {
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard.initialized_mut()?;
-
-            if let index::GcBlockingReason::DetachAncestor = reason {
-                if !upload_queue.clean.0.lineage.is_detached_from_ancestor() {
-                    drop(guard);
-                    panic!("cannot complete timeline_ancestor_detach while not detached");
-                }
-            }
-
-            let wanted = |x: Option<&index::GcBlocking>| {
-                x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
-            };
-
-            let current = upload_queue.dirty.gc_blocking.as_ref();
-            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
-
-            match (current, uploaded) {
-                (x, y) if wanted(x) && wanted(y) => None,
-                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
-                (x, y) => {
-                    if !wanted(x) && wanted(y) {
-                        warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
-                    }
-
-                    upload_queue.dirty.gc_blocking =
-                        current.as_ref().and_then(|x| x.without_reason(reason));
-                    assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
-                    // FIXME: bogus ?
-                    self.schedule_index_upload(upload_queue)?;
-                    Some(self.schedule_barrier0(upload_queue))
-                }
-            }
-        };
-
-        Ok(async move {
-            if let Some(barrier) = maybe_barrier {
-                Self::wait_completion0(barrier).await?;
-            }
-            Ok(())
-        })
+        Self::wait_completion0(barrier)
+            .await
+            .context("wait completion")
    }

    /// Launch an upload operation in the background; the file is added to be included in next
@@ -993,10 +868,7 @@ impl RemoteTimelineClient {
    ///
    /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
    /// is invoked on them.
-    pub(crate) fn schedule_gc_update(
-        self: &Arc<Self>,
-        gc_layers: &[Layer],
-    ) -> Result<(), NotInitialized> {
+    pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

@@ -1506,18 +1378,6 @@ impl RemoteTimelineClient {
                .dirty
                .layer_metadata
                .drain()
-                .filter(|(_file_name, meta)| {
-                    // Filter out layers that belonged to an ancestor shard.  Since we are deleting the whole timeline from
-                    // all shards anyway, we _could_ delete these, but
-                    // - it creates a potential race if other shards are still
-                    //   using the layers while this shard deletes them.
-                    // - it means that if we rolled back the shard split, the ancestor shards would be in a state where
-                    //   these timelines are present but corrupt (their index exists but some layers don't)
-                    //
-                    // These layers will eventually be cleaned up by the scrubber when it does physical GC.
-                    meta.shard.shard_number == self.tenant_shard_id.shard_number
-                        && meta.shard.shard_count == self.tenant_shard_id.shard_count
-                })
                .map(|(file_name, meta)| {
                    remote_layer_path(
                        &self.tenant_shard_id.tenant_id,
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -60,9 +60,6 @@ pub struct IndexPart {
    #[serde(default)]
    pub(crate) lineage: Lineage,

-    #[serde(skip_serializing_if = "Option::is_none", default)]
-    pub(crate) gc_blocking: Option<GcBlocking>,
-
    /// Describes the kind of aux files stored in the timeline.
    ///
    /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
@@ -88,11 +85,10 @@ impl IndexPart {
    /// - 6: last_aux_file_policy is added.
    /// - 7: metadata_bytes is no longer written, but still read
    /// - 8: added `archived_at`
-    /// - 9: +gc_blocking
-    const LATEST_VERSION: usize = 9;
+    const LATEST_VERSION: usize = 8;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];

    pub const FILE_NAME: &'static str = "index_part.json";

@@ -105,7 +101,6 @@ impl IndexPart {
            deleted_at: None,
            archived_at: None,
            lineage: Default::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        }
    }
@@ -216,47 +211,26 @@ fn is_false(b: &bool) -> bool {
 impl Lineage {
    const REMEMBER_AT_MOST: usize = 100;

-    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool {
+    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
        if self.reparenting_history.last() == Some(old_ancestor) {
            // do not re-record it
-            false
-        } else {
-            #[cfg(feature = "testing")]
-            {
-                let existing = self
-                    .reparenting_history
-                    .iter()
-                    .position(|x| x == old_ancestor);
-                assert_eq!(
-                    existing, None,
-                    "we cannot reparent onto and off and onto the same timeline twice"
-                );
-            }
-            let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
-
-            self.reparenting_history_truncated |= drop_oldest;
-            if drop_oldest {
-                self.reparenting_history.remove(0);
-            }
-            self.reparenting_history.push(*old_ancestor);
-            true
+            return;
        }
+
+        let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
+
+        self.reparenting_history_truncated |= drop_oldest;
+        if drop_oldest {
+            self.reparenting_history.remove(0);
+        }
+        self.reparenting_history.push(*old_ancestor);
    }

-    /// Returns true if anything changed.
-    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
-        if let Some((id, lsn, _)) = self.original_ancestor {
-            assert_eq!(
-                &(id, lsn),
-                branchpoint,
-                "detaching attempt has to be for the same ancestor we are already detached from"
-            );
-            false
-        } else {
-            self.original_ancestor =
-                Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
-            true
-        }
+    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
+        assert!(self.original_ancestor.is_none());
+
+        self.original_ancestor =
+            Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
    }

    /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
@@ -268,79 +242,15 @@ impl Lineage {
            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
    }

-    /// Returns true if the timeline originally had an ancestor, and no longer has one.
-    pub(crate) fn is_detached_from_ancestor(&self) -> bool {
+    pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
        self.original_ancestor.is_some()
    }

-    /// Returns original ancestor timeline id and lsn that this timeline has been detached from.
-    pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> {
-        self.original_ancestor.map(|(id, lsn, _)| (id, lsn))
-    }
-
    pub(crate) fn is_reparented(&self) -> bool {
        !self.reparenting_history.is_empty()
    }
 }

-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub(crate) struct GcBlocking {
-    pub(crate) started_at: NaiveDateTime,
-    pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
-}
-
-#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
-#[enumset(serialize_repr = "list")]
-pub(crate) enum GcBlockingReason {
-    Manual,
-    DetachAncestor,
-}
-
-impl GcBlocking {
-    pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
-        GcBlocking {
-            started_at: chrono::Utc::now().naive_utc(),
-            reasons: enumset::EnumSet::only(reason),
-        }
-    }
-
-    /// Returns true if the given reason is one of the reasons why the gc is blocked.
-    pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
-        self.reasons.contains(reason)
-    }
-
-    /// Returns a version of self with the given reason.
-    pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
-        assert!(!self.blocked_by(reason));
-        let mut reasons = self.reasons;
-        reasons.insert(reason);
-
-        Self {
-            started_at: self.started_at,
-            reasons,
-        }
-    }
-
-    /// Returns a version of self without the given reason. Assumption is that if
-    /// there are no more reasons, we can unblock the gc by returning `None`.
-    pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
-        assert!(self.blocked_by(reason));
-
-        if self.reasons.len() == 1 {
-            None
-        } else {
-            let mut reasons = self.reasons;
-            assert!(reasons.remove(reason));
-            assert!(!reasons.is_empty());
-
-            Some(Self {
-                started_at: self.started_at,
-                reasons,
-            })
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -382,7 +292,6 @@ mod tests {
            deleted_at: None,
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -426,7 +335,6 @@ mod tests {
            deleted_at: None,
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -471,7 +379,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -519,7 +426,6 @@ mod tests {
            deleted_at: None,
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -562,7 +468,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
            lineage: Lineage::default(),
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -608,7 +513,6 @@ mod tests {
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
            },
-            gc_blocking: None,
            last_aux_file_policy: None,
        };

@@ -659,7 +563,6 @@ mod tests {
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
            },
-            gc_blocking: None,
            last_aux_file_policy: Some(AuxFilePolicy::V2),
        };

@@ -715,7 +618,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
            lineage: Default::default(),
-            gc_blocking: None,
            last_aux_file_policy: Default::default(),
        };

@@ -772,7 +674,6 @@ mod tests {
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
            lineage: Default::default(),
-            gc_blocking: None,
            last_aux_file_policy: Default::default(),
        };

@@ -780,68 +681,6 @@ mod tests {
        assert_eq!(part, expected);
    }

-    #[test]
-    fn v9_indexpart_is_parsed() {
-        let example = r#"{
-            "version": 9,
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata": {
-                "disk_consistent_lsn": "0/16960E8",
-                "prev_record_lsn": "0/1696070",
-                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
-                "ancestor_lsn": "0/0",
-                "latest_gc_cutoff_lsn": "0/1696070",
-                "initdb_lsn": "0/1696070",
-                "pg_version": 14
-            },
-            "gc_blocking": {
-                "started_at": "2024-07-19T09:00:00.123",
-                "reasons": ["DetachAncestor"]
-            }
-        }"#;
-
-        let expected = IndexPart {
-            version: 9,
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
-                    file_size: 25600000,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
-                    file_size: 9007199254741001,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::new(
-                Lsn::from_str("0/16960E8").unwrap(),
-                Some(Lsn::from_str("0/1696070").unwrap()),
-                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
-                Lsn::INVALID,
-                Lsn::from_str("0/1696070").unwrap(),
-                Lsn::from_str("0/1696070").unwrap(),
-                14,
-            ).with_recalculated_checksum().unwrap(),
-            deleted_at: None,
-            lineage: Default::default(),
-            gc_blocking: Some(GcBlocking {
-                started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
-                reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
-            }),
-            last_aux_file_policy: Default::default(),
-            archived_at: None,
-        };
-
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
-        assert_eq!(part, expected);
-    }
-
    fn parse_naive_datetime(s: &str) -> NaiveDateTime {
        chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
    }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -55,7 +55,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
 use utils::{
    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
-    id::TimelineId, pausable_failpoint, serde_system_time,
+    id::TimelineId, serde_system_time,
 };

 use super::{
@@ -1146,14 +1146,12 @@ impl<'a> TenantDownloader<'a> {
        layer: HeatMapLayer,
        ctx: &RequestContext,
    ) -> Result<Option<HeatMapLayer>, UpdateError> {
-        // Failpoints for simulating slow remote storage
+        // Failpoint for simulating slow remote storage
        failpoint_support::sleep_millis_async!(
            "secondary-layer-download-sleep",
            &self.secondary_state.cancel
        );

-        pausable_failpoint!("secondary-layer-download-pausable");
-
        let local_path = local_layer_path(
            self.conf,
            tenant_shard_id,
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,9 +8,6 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;

-#[cfg(test)]
-pub mod split_writer;
-
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
 use crate::walrecord::NeonWalRecord;
@@ -435,18 +432,39 @@ impl ReadableLayer {
    }
 }

+/// Return value from [`Layer::get_value_reconstruct_data`]
+#[derive(Clone, Copy, Debug)]
+pub enum ValueReconstructResult {
+    /// Got all the data needed to reconstruct the requested page
+    Complete,
+    /// This layer didn't contain all the required data; the caller should look up
+    /// the predecessor layer at the returned LSN and collect more data from there.
+    Continue,
+
+    /// This layer didn't contain data needed to reconstruct the page version at
+    /// the returned LSN. This is usually considered an error, but might be OK
+    /// in some circumstances.
+    Missing,
+}
+
 /// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
 /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
 /// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
 /// be used for cache management but not for correctness-critical checks.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum LayerVisibilityHint {
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
+pub(crate) enum LayerVisibilityHint {
    /// A Visible layer might be read while serving a read, because there is not an image layer between it
    /// and a readable LSN (the tip of the branch or a child's branch point)
    Visible,
    /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
    /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
+    #[allow(unused)]
    Covered,
+    /// Calculating layer visibility requires I/O, so until this has happened layers are loaded
+    /// in this state.  Note that newly written layers may be called Visible immediately; this uninitialized
+    /// state is for when existing layers are constructed while loading a timeline.
+    #[default]
+    Uninitialized,
 }

 pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
@@ -539,25 +557,19 @@ impl LayerAccessStats {
        self.record_residence_event_at(SystemTime::now())
    }

-    fn record_access_at(&self, now: SystemTime) -> bool {
+    pub(crate) fn record_access_at(&self, now: SystemTime) {
        let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);

        // A layer which is accessed must be visible.
        mask |= 0x1 << Self::VISIBILITY_SHIFT;
        value |= 0x1 << Self::VISIBILITY_SHIFT;

-        let old_bits = self.write_bits(mask, value);
-        !matches!(
-            self.decode_visibility(old_bits),
-            LayerVisibilityHint::Visible
-        )
+        self.write_bits(mask, value);
    }

-    /// Returns true if we modified the layer's visibility to set it to Visible implicitly
-    /// as a result of this access
-    pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
+    pub(crate) fn record_access(&self, ctx: &RequestContext) {
        if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
-            return false;
+            return;
        }

        self.record_access_at(SystemTime::now())
@@ -614,29 +626,22 @@ impl LayerAccessStats {
        }
    }

-    /// Helper for extracting the visibility hint from the literal value of our inner u64
-    fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
-        match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
-            1 => LayerVisibilityHint::Visible,
-            0 => LayerVisibilityHint::Covered,
-            _ => unreachable!(),
-        }
-    }
-
-    /// Returns the old value which has been replaced
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
        let value = match visibility {
            LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
-            LayerVisibilityHint::Covered => 0x0,
+            LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
        };

-        let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
-        self.decode_visibility(old_bits)
+        self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
    }

    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
-        self.decode_visibility(read)
+        match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
+            1 => LayerVisibilityHint::Visible,
+            0 => LayerVisibilityHint::Covered,
+            _ => unreachable!(),
+        }
    }
 }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -36,12 +36,13 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
 use crate::tenant::disk_btree::{
    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadPlanner,
 };
-use crate::tenant::PageReconstructError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
@@ -71,7 +72,10 @@ use utils::{
    lsn::Lsn,
 };

-use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
+use super::{
+    AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
+    ValuesReconstructState,
+};

 ///
 /// Header stored in the beginning of the file
@@ -196,6 +200,7 @@ impl DeltaKey {
 pub struct DeltaLayer {
    path: Utf8PathBuf,
    pub desc: PersistentLayerDesc,
+    access_stats: LayerAccessStats,
    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

@@ -294,6 +299,7 @@ impl DeltaLayer {
    /// not loaded already.
    ///
    async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
+        self.access_stats.record_access(ctx);
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
@@ -344,6 +350,7 @@ impl DeltaLayer {
                summary.lsn_range,
                metadata.len(),
            ),
+            access_stats: Default::default(),
            inner: OnceCell::new(),
        })
    }
@@ -366,6 +373,7 @@ impl DeltaLayer {
 /// 3. Call `finish`.
 ///
 struct DeltaLayerWriterInner {
+    conf: &'static PageServerConf,
    pub path: Utf8PathBuf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
@@ -376,9 +384,6 @@ struct DeltaLayerWriterInner {
    tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,

    blob_writer: BlobWriter<true>,
-
-    // Number of key-lsns in the layer.
-    num_keys: usize,
 }

 impl DeltaLayerWriterInner {
@@ -412,6 +417,7 @@ impl DeltaLayerWriterInner {
        let tree_builder = DiskBtreeBuilder::new(block_buf);

        Ok(Self {
+            conf,
            path,
            timeline_id,
            tenant_shard_id,
@@ -419,7 +425,6 @@ impl DeltaLayerWriterInner {
            lsn_range,
            tree: tree_builder,
            blob_writer,
-            num_keys: 0,
        })
    }

@@ -470,9 +475,6 @@ impl DeltaLayerWriterInner {

        let delta_key = DeltaKey::from_key_lsn(&key, lsn);
        let res = self.tree.append(&delta_key.0, blob_ref.0);
-
-        self.num_keys += 1;
-
        (val, res.map_err(|e| anyhow::anyhow!(e)))
    }

@@ -486,10 +488,11 @@ impl DeltaLayerWriterInner {
    async fn finish(
        self,
        key_end: Key,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+    ) -> anyhow::Result<ResidentLayer> {
        let temp_path = self.path.clone();
-        let result = self.finish0(key_end, ctx).await;
+        let result = self.finish0(key_end, timeline, ctx).await;
        if result.is_err() {
            tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
            if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -502,8 +505,9 @@ impl DeltaLayerWriterInner {
    async fn finish0(
        self,
        key_end: Key,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -568,9 +572,11 @@ impl DeltaLayerWriterInner {
        // fsync the file
        file.sync_all().await?;

-        trace!("created delta layer {}", self.path);
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        Ok((desc, self.path))
+        trace!("created delta layer {}", layer.local_path());
+
+        Ok(layer)
    }
 }

@@ -671,20 +677,14 @@ impl DeltaLayerWriter {
    pub(crate) async fn finish(
        mut self,
        key_end: Key,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        self.inner.take().unwrap().finish(key_end, ctx).await
-    }
-
-    #[cfg(test)]
-    pub(crate) fn num_keys(&self) -> usize {
-        self.inner.as_ref().unwrap().num_keys
-    }
-
-    #[cfg(test)]
-    pub(crate) fn estimated_size(&self) -> u64 {
-        let inner = self.inner.as_ref().unwrap();
-        inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
+    ) -> anyhow::Result<ResidentLayer> {
+        self.inner
+            .take()
+            .unwrap()
+            .finish(key_end, timeline, ctx)
+            .await
    }
 }

@@ -808,6 +808,95 @@ impl DeltaLayerInner {
        })
    }

+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let mut need_image = true;
+        // Scan the page versions backwards, starting from `lsn_range.end - 1`.
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            &block_reader,
+        );
+        let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
+
+        let mut offsets: Vec<(Lsn, u64)> = Vec::new();
+
+        tree_reader
+            .visit(
+                &search_key.0,
+                VisitDirection::Backwards,
+                |key, value| {
+                    let blob_ref = BlobRef(value);
+                    if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
+                        return false;
+                    }
+                    let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                    if entry_lsn < lsn_range.start {
+                        return false;
+                    }
+                    offsets.push((entry_lsn, blob_ref.pos()));
+
+                    !blob_ref.will_init()
+                },
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+                    .build(),
+            )
+            .await?;
+
+        let ctx = &RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::DeltaLayerValue)
+            .build();
+
+        // 'offsets' now holds the (LSN, blob position) of every entry we need to read.
+        let cursor = block_reader.block_cursor();
+        let mut buf = Vec::new();
+        for (entry_lsn, pos) in offsets {
+            cursor
+                .read_blob_into_buf(pos, &mut buf, ctx)
+                .await
+                .with_context(|| {
+                    format!("Failed to read blob from virtual file {}", self.file.path)
+                })?;
+            let val = Value::des(&buf).with_context(|| {
+                format!(
+                    "Failed to deserialize file blob from virtual file {}",
+                    self.file.path
+                )
+            })?;
+            match val {
+                Value::Image(img) => {
+                    reconstruct_state.img = Some((entry_lsn, img));
+                    need_image = false;
+                    break;
+                }
+                Value::WalRecord(rec) => {
+                    let will_init = rec.will_init();
+                    reconstruct_state.records.push((entry_lsn, rec));
+                    if will_init {
+                        // This WAL record initializes the page, so no need to go further back
+                        need_image = false;
+                        break;
+                    }
+                }
+            }
+        }
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
+    }
+
    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    //
@@ -975,7 +1064,7 @@ impl DeltaLayerInner {
                .blobs_at
                .as_slice()
                .iter()
-                .map(|(_, (_, blob_meta))| format!("{}@{}", blob_meta.key, blob_meta.lsn))
+                .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
                .join(", ");
            tracing::warn!(
                "Oversized vectored read ({} > {}) for keys {}",
@@ -1017,7 +1106,7 @@ impl DeltaLayerInner {
                Ok(blobs_buf) => blobs_buf,
                Err(err) => {
                    let kind = err.kind();
-                    for (_, (_, blob_meta)) in read.blobs_at.as_slice() {
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
                        reconstruct_state.on_key_error(
                            blob_meta.key,
                            PageReconstructError::from(anyhow!(
@@ -1580,9 +1669,8 @@ pub(crate) mod test {
    use super::*;
    use crate::repository::Value;
    use crate::tenant::harness::TIMELINE_ID;
-    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
-    use crate::tenant::{Tenant, Timeline};
+    use crate::tenant::Tenant;
    use crate::{
        context::DownloadBehavior,
        task_mgr::TaskKind,
@@ -1678,7 +1766,7 @@ pub(crate) mod test {

        let mut planned_blobs = Vec::new();
        for read in vectored_reads {
-            for (at, (_, meta)) in read.blobs_at.as_slice() {
+            for (at, meta) in read.blobs_at.as_slice() {
                planned_blobs.push(BlobSpec {
                    key: meta.key,
                    lsn: meta.lsn,
@@ -1876,8 +1964,9 @@ pub(crate) mod test {
            res?;
        }

-        let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?;
-        let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?;
+        let resident = writer
+            .finish(entries_meta.key_range.end, &timeline, &ctx)
+            .await?;

        let inner = resident.get_as_delta(&ctx).await?;

@@ -1957,7 +2046,6 @@ pub(crate) mod test {
            .await
            .likely_resident_layers()
            .next()
-            .cloned()
            .unwrap();

        {
@@ -2032,8 +2120,7 @@ pub(crate) mod test {
            .read()
            .await
            .likely_resident_layers()
-            .find(|&x| x != &initdb_layer)
-            .cloned()
+            .find(|x| x != &initdb_layer)
            .unwrap();

        // create a copy for the timeline, so we don't overwrite the file
@@ -2068,8 +2155,7 @@ pub(crate) mod test {
                .await
                .unwrap();

-            let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap();
-            let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();
+            let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();

            copied_layer.get_as_delta(ctx).await.unwrap();

@@ -2197,9 +2283,7 @@ pub(crate) mod test {
        for (key, lsn, value) in deltas {
            writer.put_value(key, lsn, value, ctx).await?;
        }
-
-        let (desc, path) = writer.finish(key_end, ctx).await?;
-        let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
+        let delta_layer = writer.finish(key_end, tline, ctx).await?;

        Ok::<_, anyhow::Error>(delta_layer)
    }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -32,6 +32,9 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{
    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
+use crate::tenant::storage_layer::{
+    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
+};
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -134,6 +137,7 @@ pub struct ImageLayer {
    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
+    access_stats: LayerAccessStats,
    inner: OnceCell<ImageLayerInner>,
 }

@@ -251,6 +255,7 @@ impl ImageLayer {
    /// not loaded already.
    ///
    async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
+        self.access_stats.record_access(ctx);
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
            .await
@@ -301,6 +306,7 @@ impl ImageLayer {
                metadata.len(),
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
+            access_stats: Default::default(),
            inner: OnceCell::new(),
        })
    }
@@ -369,6 +375,9 @@ impl ImageLayerInner {
        self.lsn
    }

+    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
+    /// - inner has the success or transient failure
+    /// - outer has the permanent failure
    pub(super) async fn load(
        path: &Utf8Path,
        lsn: Lsn,
@@ -420,6 +429,46 @@ impl ImageLayerInner {
        })
    }

+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
+
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf);
+        if let Some(offset) = tree_reader
+            .get(
+                &keybuf,
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::ImageLayerBtreeNode)
+                    .build(),
+            )
+            .await?
+        {
+            let blob = block_reader
+                .block_cursor()
+                .read_blob(
+                    offset,
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::ImageLayerValue)
+                        .build(),
+                )
+                .await
+                .with_context(|| format!("failed to read value from offset {}", offset))?;
+            let value = Bytes::from(blob);
+
+            reconstruct_state.img = Some((self.lsn, value));
+            Ok(ValueReconstructResult::Complete)
+        } else {
+            Ok(ValueReconstructResult::Missing)
+        }
+    }
+
    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    pub(super) async fn get_values_reconstruct_data(
@@ -602,7 +651,7 @@ impl ImageLayerInner {
                    .blobs_at
                    .as_slice()
                    .iter()
-                    .map(|(_, (_, blob_meta))| format!("{}@{}", blob_meta.key, blob_meta.lsn))
+                    .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
                    .join(", ");
                tracing::warn!(
                    "Oversized vectored read ({} > {}) for keys {}",
@@ -630,7 +679,7 @@ impl ImageLayerInner {
                }
                Err(err) => {
                    let kind = err.kind();
-                    for (_, (_, blob_meta)) in read.blobs_at.as_slice() {
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
                        reconstruct_state.on_key_error(
                            blob_meta.key,
                            PageReconstructError::from(anyhow!(
@@ -693,21 +742,11 @@ struct ImageLayerWriterInner {
    // where we have chosen their compressed form
    uncompressed_bytes_chosen: u64,

-    // Number of keys in the layer.
-    num_keys: usize,
-
    blob_writer: BlobWriter<false>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
-
-    #[cfg_attr(not(feature = "testing"), allow(dead_code))]
-    last_written_key: Key,
 }

 impl ImageLayerWriterInner {
-    fn size(&self) -> u64 {
-        self.tree.borrow_writer().size() + self.blob_writer.size()
-    }
-
    ///
    /// Start building a new image layer.
    ///
@@ -761,8 +800,6 @@ impl ImageLayerWriterInner {
            uncompressed_bytes: 0,
            uncompressed_bytes_eligible: 0,
            uncompressed_bytes_chosen: 0,
-            num_keys: 0,
-            last_written_key: Key::MIN,
        };

        Ok(writer)
@@ -783,7 +820,6 @@ impl ImageLayerWriterInner {
        let compression = self.conf.image_compression;
        let uncompressed_len = img.len() as u64;
        self.uncompressed_bytes += uncompressed_len;
-        self.num_keys += 1;
        let (_img, res) = self
            .blob_writer
            .write_blob_maybe_compressed(img, ctx, compression)
@@ -803,11 +839,6 @@ impl ImageLayerWriterInner {
        key.write_to_byte_slice(&mut keybuf);
        self.tree.append(&keybuf, off)?;

-        #[cfg(feature = "testing")]
-        {
-            self.last_written_key = key;
-        }
-
        Ok(())
    }

@@ -818,7 +849,6 @@ impl ImageLayerWriterInner {
        self,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-        end_key: Option<Key>,
    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -869,23 +899,11 @@ impl ImageLayerWriterInner {
        let desc = PersistentLayerDesc::new_img(
            self.tenant_shard_id,
            self.timeline_id,
-            if let Some(end_key) = end_key {
-                self.key_range.start..end_key
-            } else {
-                self.key_range.clone()
-            },
+            self.key_range.clone(),
            self.lsn,
            metadata.len(),
        );

-        #[cfg(feature = "testing")]
-        if let Some(end_key) = end_key {
-            assert!(
-                self.last_written_key < end_key,
-                "written key violates end_key range"
-            );
-        }
-
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -962,18 +980,6 @@ impl ImageLayerWriter {
        self.inner.as_mut().unwrap().put_image(key, img, ctx).await
    }

-    #[cfg(test)]
-    /// Estimated size of the image layer.
-    pub(crate) fn estimated_size(&self) -> u64 {
-        let inner = self.inner.as_ref().unwrap();
-        inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
-    }
-
-    #[cfg(test)]
-    pub(crate) fn num_keys(&self) -> usize {
-        self.inner.as_ref().unwrap().num_keys
-    }
-
    ///
    /// Finish writing the image layer.
    ///
@@ -982,26 +988,7 @@ impl ImageLayerWriter {
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner.take().unwrap().finish(timeline, ctx, None).await
-    }
-
-    #[cfg(test)]
-    /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
-    pub(super) async fn finish_with_end_key(
-        mut self,
-        timeline: &Arc<Timeline>,
-        end_key: Key,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(timeline, ctx, Some(end_key))
-            .await
-    }
-
-    pub(crate) fn size(&self) -> u64 {
-        self.inner.as_ref().unwrap().size()
+        self.inner.take().unwrap().finish(timeline, ctx).await
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,12 +10,11 @@ use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value};
 use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
+use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
-use crate::tenant::PageReconstructError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::{l0_flush, page_cache, walrecord};
-use anyhow::{anyhow, Result};
-use camino::Utf8PathBuf;
-use pageserver_api::key::CompactKey;
+use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
@@ -35,7 +34,8 @@ use std::sync::atomic::{AtomicU64, AtomicUsize};
 use tokio::sync::{RwLock, RwLockWriteGuard};

 use super::{
-    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
+    DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
+    ValuesReconstructState,
 };

 #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
@@ -55,6 +55,9 @@ pub struct InMemoryLayer {
    /// Writes are only allowed when this is `None`.
    pub(crate) end_lsn: OnceLock<Lsn>,

+    /// Used for traversal path. Cached representation of the in-memory layer before it is frozen.
+    local_path_str: Arc<str>,
+
    /// Used for traversal path. Cached representation of the in-memory layer after frozen.
    frozen_local_path_str: OnceLock<Arc<str>>,

@@ -79,7 +82,7 @@ pub struct InMemoryLayerInner {
    /// All versions of all pages in the layer are kept here. Indexed
    /// by block number and LSN. The value is an offset into the
    /// ephemeral file where the page version is stored.
-    index: BTreeMap<CompactKey, VecMap<Lsn, u64>>,
+    index: BTreeMap<Key, VecMap<Lsn, u64>>,

    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
@@ -245,6 +248,12 @@ impl InMemoryLayer {
        self.start_lsn..self.end_lsn_or_max()
    }

+    pub(crate) fn local_path_str(&self) -> &Arc<str> {
+        self.frozen_local_path_str
+            .get()
+            .unwrap_or(&self.local_path_str)
+    }
+
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
@@ -294,6 +303,60 @@ impl InMemoryLayer {
        Ok(())
    }

+    /// Look up the versions of `key` within `lsn_range` and add them to `reconstruct_state`.
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        ensure!(lsn_range.start >= self.start_lsn);
+        let mut need_image = true;
+
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+
+        let inner = self.inner.read().await;
+
+        let reader = inner.file.block_cursor();
+
+        // Scan the page versions backwards, starting from the newest entry in `lsn_range`.
+        if let Some(vec_map) = inner.index.get(&key) {
+            let slice = vec_map.slice_range(lsn_range);
+            for (entry_lsn, pos) in slice.iter().rev() {
+                let buf = reader.read_blob(*pos, &ctx).await?;
+                let value = Value::des(&buf)?;
+                match value {
+                    Value::Image(img) => {
+                        reconstruct_state.img = Some((*entry_lsn, img));
+                        return Ok(ValueReconstructResult::Complete);
+                    }
+                    Value::WalRecord(rec) => {
+                        let will_init = rec.will_init();
+                        reconstruct_state.records.push((*entry_lsn, rec));
+                        if will_init {
+                            // This WAL record initializes the page, so no need to go further back
+                            need_image = false;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        // NOTE: the read lock on 'inner' is not released here; the guard lives until we return.
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
+    }
+
    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
    //
@@ -313,12 +376,8 @@ impl InMemoryLayer {
        let reader = inner.file.block_cursor();

        for range in keyspace.ranges.iter() {
-            for (key, vec_map) in inner
-                .index
-                .range(range.start.to_compact()..range.end.to_compact())
-            {
-                let key = Key::from_compact(*key);
-                let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
+            for (key, vec_map) in inner.index.range(range.start..range.end) {
+                let lsn_range = match reconstruct_state.get_cached_lsn(key) {
                    Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
                    None => self.start_lsn..end_lsn,
                };
@@ -329,18 +388,20 @@ impl InMemoryLayer {
                    // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
                    let buf = reader.read_blob(*pos, &ctx).await;
                    if let Err(e) = buf {
-                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                        reconstruct_state
+                            .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
                        break;
                    }

                    let value = Value::des(&buf.unwrap());
                    if let Err(e) = value {
-                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                        reconstruct_state
+                            .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
                        break;
                    }

                    let key_situation =
-                        reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
+                        reconstruct_state.update_key(key, *entry_lsn, value.unwrap());
                    if key_situation == ValueReconstructSituation::Complete {
                        break;
                    }
@@ -388,17 +449,20 @@ impl InMemoryLayer {
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        start_lsn: Lsn,
-        gate_guard: utils::sync::gate::GateGuard,
        ctx: &RequestContext,
    ) -> Result<InMemoryLayer> {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");

-        let file =
-            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
+        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
        let key = InMemoryLayerFileId(file.page_cache_file_id());

        Ok(InMemoryLayer {
            file_id: key,
+            local_path_str: {
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
+                buf.into()
+            },
            frozen_local_path_str: OnceLock::new(),
            conf,
            timeline_id,
@@ -418,9 +482,10 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub async fn put_value(
+
+    pub(crate) async fn put_value(
        &self,
-        key: CompactKey,
+        key: Key,
        lsn: Lsn,
        buf: &[u8],
        ctx: &RequestContext,
@@ -433,7 +498,7 @@ impl InMemoryLayer {
    async fn put_value_locked(
        &self,
        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
-        key: CompactKey,
+        key: Key,
        lsn: Lsn,
        buf: &[u8],
        ctx: &RequestContext,
@@ -483,6 +548,8 @@ impl InMemoryLayer {
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    pub async fn freeze(&self, end_lsn: Lsn) {
+        let inner = self.inner.write().await;
+
        assert!(
            self.start_lsn < end_lsn,
            "{} >= {}",
@@ -500,13 +567,9 @@ impl InMemoryLayer {
            })
            .expect("frozen_local_path_str set only once");

-        #[cfg(debug_assertions)]
-        {
-            let inner = self.inner.write().await;
-            for vec_map in inner.index.values() {
-                for (lsn, _pos) in vec_map.as_slice() {
-                    assert!(*lsn < end_lsn);
-                }
+        for vec_map in inner.index.values() {
+            for (lsn, _pos) in vec_map.as_slice() {
+                assert!(*lsn < end_lsn);
            }
        }
    }
@@ -516,12 +579,12 @@ impl InMemoryLayer {
    /// if there are no matching keys.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub async fn write_to_disk(
+    pub(crate) async fn write_to_disk(
        &self,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        key_range: Option<Range<Key>>,
-        l0_flush_global_state: &l0_flush::Inner,
-    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
+    ) -> Result<Option<ResidentLayer>> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -533,8 +596,9 @@ impl InMemoryLayer {
        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().await;

+        let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
        use l0_flush::Inner;
-        let _concurrency_permit = match l0_flush_global_state {
+        let _concurrency_permit = match &*l0_flush_global_state {
            Inner::PageCached => None,
            Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
        };
@@ -542,8 +606,6 @@ impl InMemoryLayer {
        let end_lsn = *self.end_lsn.get().unwrap();

        let key_count = if let Some(key_range) = key_range {
-            let key_range = key_range.start.to_compact()..key_range.end.to_compact();
-
            inner
                .index
                .iter()
@@ -566,7 +628,7 @@ impl InMemoryLayer {
        )
        .await?;

-        match l0_flush_global_state {
+        match &*l0_flush_global_state {
            l0_flush::Inner::PageCached => {
                let ctx = RequestContextBuilder::extend(ctx)
                    .page_content_kind(PageContentKind::InMemoryLayer)
@@ -583,7 +645,7 @@ impl InMemoryLayer {
                        let will_init = Value::des(&buf)?.will_init();
                        let res;
                        (buf, res) = delta_layer_writer
-                            .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, &ctx)
+                            .put_value_bytes(*key, *lsn, buf, will_init, &ctx)
                            .await;
                        res?;
                    }
@@ -622,7 +684,7 @@ impl InMemoryLayer {
                        let will_init = Value::des(&buf)?.will_init();
                        let res;
                        (buf, res) = delta_layer_writer
-                            .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, ctx)
+                            .put_value_bytes(*key, *lsn, buf, will_init, ctx)
                            .await;
                        res?;
                    }
@@ -631,7 +693,7 @@ impl InMemoryLayer {
        }

        // MAX is used here because we identify L0 layers by full key range
-        let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;

        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
        //
@@ -643,6 +705,6 @@ impl InMemoryLayer {
        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
        drop(_concurrency_permit);

-        Ok(Some((desc, path)))
+        Ok(Some(delta_layer))
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -24,7 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
 use super::image_layer::{self};
 use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
+    PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -246,7 +246,7 @@ impl Layer {
                &timeline.generation,
            );

-            LayerInner::new(
+            let layer = LayerInner::new(
                conf,
                timeline,
                local_path,
@@ -254,7 +254,14 @@ impl Layer {
                Some(inner),
                timeline.generation,
                timeline.get_shard_index(),
-            )
+            );
+
+            // Newly created layers are marked visible by default: the usual case is that they were created to be read.
+            layer
+                .access_stats
+                .set_visibility(super::LayerVisibilityHint::Visible);
+
+            layer
        }));

        let downloaded = resident.expect("just initialized");
@@ -300,6 +307,42 @@ impl Layer {
        self.0.delete_on_drop();
    }

+    /// Return data needed to reconstruct given page at LSN.
+    ///
+    /// It is up to the caller to collect more data from the previous layer and
+    /// perform WAL redo, if necessary.
+    ///
+    /// # Cancellation-Safety
+    ///
+    /// This method is cancellation-safe.
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        use anyhow::ensure;
+
+        let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
+        self.0.access_stats.record_access(ctx);
+
+        if self.layer_desc().is_delta {
+            ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
+            ensure!(self.layer_desc().key_range.contains(&key));
+        } else {
+            ensure!(self.layer_desc().key_range.contains(&key));
+            ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
+            ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
+        }
+
+        layer
+            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
+            .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
+            .await
+            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
+    }
+
    pub(crate) async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
@@ -316,7 +359,7 @@ impl Layer {
                other => GetVectoredError::Other(anyhow::anyhow!(other)),
            })?;

-        self.record_access(ctx);
+        self.0.access_stats.record_access(ctx);

        layer
            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -396,18 +439,18 @@ impl Layer {
        self.0.info(reset)
    }

-    pub(crate) fn latest_activity(&self) -> SystemTime {
-        self.0.access_stats.latest_activity()
-    }
-
-    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
-        self.0.access_stats.visibility()
+    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
+        &self.0.access_stats
    }

    pub(crate) fn local_path(&self) -> &Utf8Path {
        &self.0.path
    }

+    pub(crate) fn debug_str(&self) -> &Arc<str> {
+        &self.0.debug_str
+    }
+
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.0.metadata()
    }
@@ -450,57 +493,13 @@ impl Layer {
            }
        }
    }
-
-    fn record_access(&self, ctx: &RequestContext) {
-        if self.0.access_stats.record_access(ctx) {
-            // Visibility was modified to Visible
-            tracing::info!(
-                "Layer {} became visible as a result of access",
-                self.0.desc.key()
-            );
-            if let Some(tl) = self.0.timeline.upgrade() {
-                tl.metrics
-                    .visible_physical_size_gauge
-                    .add(self.0.desc.file_size)
-            }
-        }
-    }
-
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
-        let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
-        use LayerVisibilityHint::*;
-        match (old_visibility, visibility) {
-            (Visible, Covered) => {
-                // Subtract this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    debug_assert!(
-                        tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
-                    );
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .sub(self.0.desc.file_size)
-                }
-            }
-            (Covered, Visible) => {
-                // Add this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .add(self.0.desc.file_size)
-                }
-            }
-            (Covered, Covered) | (Visible, Visible) => {
-                // no change
-            }
-        }
-    }
 }

 /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
 ///
 /// However when we want something evicted, we cannot evict it right away as there might be current
 /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
-/// read with [`Layer::get_values_reconstruct_data`].
+/// read with [`Layer::get_value_reconstruct_data`].
 ///
 /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
 #[derive(Debug)]
@@ -581,6 +580,9 @@ struct LayerInner {
    /// Full path to the file; unclear if this should exist anymore.
    path: Utf8PathBuf,

+    /// String representation of the layer, used for traversal id.
+    debug_str: Arc<str>,
+
    desc: PersistentLayerDesc,

    /// Timeline access is needed for remote timeline client and metrics.
@@ -691,16 +693,6 @@ impl Drop for LayerInner {
                timeline.metrics.layer_count_image.dec();
                timeline.metrics.layer_size_image.sub(self.desc.file_size);
            }
-
-            if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
-                debug_assert!(
-                    timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
-                );
-                timeline
-                    .metrics
-                    .visible_physical_size_gauge
-                    .sub(self.desc.file_size);
-            }
        }

        if !*self.wanted_deleted.get_mut() {
@@ -809,14 +801,11 @@ impl LayerInner {
            timeline.metrics.layer_size_image.add(desc.file_size);
        }

-        // New layers are visible by default. This metric is later updated on drop or in set_visibility
-        timeline
-            .metrics
-            .visible_physical_size_gauge
-            .add(desc.file_size);
-
        LayerInner {
            conf,
+            debug_str: {
+                format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
+            },
            path: local_path,
            desc,
            timeline: Arc::downgrade(timeline),
@@ -1737,6 +1726,28 @@ impl DownloadedLayer {
            .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
    }

+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        owner: &Arc<LayerInner>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        use LayerKind::*;
+
+        match self.get(owner, ctx).await? {
+            Delta(d) => {
+                d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
+                    .await
+            }
+            Image(i) => {
+                i.get_value_reconstruct_data(key, reconstruct_data, ctx)
+                    .await
+            }
+        }
+    }
+
    async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
@@ -1835,7 +1846,7 @@ impl ResidentLayer {
                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
                // while it's being held.
-                self.owner.record_access(ctx);
+                owner.access_stats.record_access(ctx);

                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
@@ -1848,8 +1859,8 @@ impl ResidentLayer {
    /// Read all they keys in this layer which match the ShardIdentity, and write them all to
    /// the provided writer.  Return the number of keys written.
    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
-    pub(crate) async fn filter(
-        &self,
+    pub(crate) async fn filter<'a>(
+        &'a self,
        shard_identity: &ShardIdentity,
        writer: &mut ImageLayerWriter,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -39,7 +39,7 @@ async fn smoke_test() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -50,26 +50,13 @@ async fn smoke_test() {
    // all layers created at pageserver are like `layer`, initialized with strong
    // Arc<DownloadedLayer>.

-    let controlfile_keyspace = KeySpace {
-        ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
-    };
-
    let img_before = {
-        let mut data = ValuesReconstructState::default();
+        let mut data = ValueReconstructState::default();
        layer
-            .get_values_reconstruct_data(
-                controlfile_keyspace.clone(),
-                Lsn(0x10)..Lsn(0x11),
-                &mut data,
-                &ctx,
-            )
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
            .await
            .unwrap();
-        data.keys
-            .remove(&CONTROLFILE_KEY)
-            .expect("must be present")
-            .expect("should not error")
-            .img
+        data.img
            .take()
            .expect("tenant harness writes the control file")
    };
@@ -87,24 +74,13 @@ async fn smoke_test() {

    // on accesses when the layer is evicted, it will automatically be downloaded.
    let img_after = {
-        let mut data = ValuesReconstructState::default();
+        let mut data = ValueReconstructState::default();
        layer
-            .get_values_reconstruct_data(
-                controlfile_keyspace.clone(),
-                Lsn(0x10)..Lsn(0x11),
-                &mut data,
-                &ctx,
-            )
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
            .instrument(download_span.clone())
            .await
            .unwrap();
-        data.keys
-            .remove(&CONTROLFILE_KEY)
-            .expect("must be present")
-            .expect("should not error")
-            .img
-            .take()
-            .expect("tenant harness writes the control file")
+        data.img.take().unwrap()
    };

    assert_eq!(img_before, img_after);
@@ -176,7 +152,7 @@ async fn smoke_test() {
    {
        let layers = &[layer];
        let mut g = timeline.layers.write().await;
-        g.open_mut().unwrap().finish_gc_timeline(layers);
+        g.finish_gc_timeline(layers);
        // this just updates the remote_physical_size for demonstration purposes
        rtc.schedule_gc_update(layers).unwrap();
    }
@@ -216,7 +192,7 @@ async fn evict_and_wait_on_wanted_deleted() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -260,7 +236,7 @@ async fn evict_and_wait_on_wanted_deleted() {
    // the deletion of the layer in remote_storage happens.
    {
        let mut layers = timeline.layers.write().await;
-        layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
+        layers.finish_gc_timeline(&[layer]);
    }

    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
@@ -301,7 +277,7 @@ fn read_wins_pending_eviction() {
        let layer = {
            let mut layers = {
                let layers = timeline.layers.read().await;
-                layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+                layers.likely_resident_layers().collect::<Vec<_>>()
            };

            assert_eq!(layers.len(), 1);
@@ -433,7 +409,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
        let layer = {
            let mut layers = {
                let layers = timeline.layers.read().await;
-                layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+                layers.likely_resident_layers().collect::<Vec<_>>()
            };

            assert_eq!(layers.len(), 1);
@@ -602,7 +578,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -682,7 +658,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -801,9 +777,9 @@ async fn eviction_cancellation_on_drop() {
    let (evicted_layer, not_evicted) = {
        let mut layers = {
            let mut guard = timeline.layers.write().await;
-            let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
+            let layers = guard.likely_resident_layers().collect::<Vec<_>>();
            // remove the layers from layermap
-            guard.open_mut().unwrap().finish_gc_timeline(&layers);
+            guard.finish_gc_timeline(&layers);

            layers
        };
@@ -854,7 +830,7 @@ async fn eviction_cancellation_on_drop() {
 fn layer_size() {
    assert_eq!(size_of::<LayerAccessStats>(), 8);
    assert_eq!(size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(size_of::<LayerInner>(), 296);
+    assert_eq!(size_of::<LayerInner>(), 312);
    // it also has the utf8 path
 }

--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -41,20 +41,6 @@ pub struct PersistentLayerKey {
    pub is_delta: bool,
 }

-impl std::fmt::Display for PersistentLayerKey {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{}..{} {}..{} is_delta={}",
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta
-        )
-    }
-}
-
 impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -1,454 +0,0 @@
-use std::{ops::Range, sync::Arc};
-
-use bytes::Bytes;
-use pageserver_api::key::{Key, KEY_SIZE};
-use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
-
-use crate::tenant::storage_layer::Layer;
-use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
-
-use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
-
-/// An image writer that takes images and produces multiple image layers. The interface does not
-/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
-/// to be cleaned up)
-#[must_use]
-pub struct SplitImageLayerWriter {
-    inner: ImageLayerWriter,
-    target_layer_size: u64,
-    generated_layers: Vec<ResidentLayer>,
-    conf: &'static PageServerConf,
-    timeline_id: TimelineId,
-    tenant_shard_id: TenantShardId,
-    lsn: Lsn,
-}
-
-impl SplitImageLayerWriter {
-    pub async fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_shard_id: TenantShardId,
-        start_key: Key,
-        lsn: Lsn,
-        target_layer_size: u64,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
-            target_layer_size,
-            inner: ImageLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                &(start_key..Key::MAX),
-                lsn,
-                ctx,
-            )
-            .await?,
-            generated_layers: Vec::new(),
-            conf,
-            timeline_id,
-            tenant_shard_id,
-            lsn,
-        })
-    }
-
-    pub async fn put_image(
-        &mut self,
-        key: Key,
-        img: Bytes,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // The current estimation is an upper bound of the space that the key/image could take
-        // because we did not consider compression in this estimation. The resulting image layer
-        // could be smaller than the target size.
-        let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
-        {
-            let next_image_writer = ImageLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_shard_id,
-                &(key..Key::MAX),
-                self.lsn,
-                ctx,
-            )
-            .await?;
-            let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
-            self.generated_layers.push(
-                prev_image_writer
-                    .finish_with_end_key(tline, key, ctx)
-                    .await?,
-            );
-        }
-        self.inner.put_image(key, img, ctx).await
-    }
-
-    pub(crate) async fn finish(
-        self,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-        end_key: Key,
-    ) -> anyhow::Result<Vec<ResidentLayer>> {
-        let Self {
-            mut generated_layers,
-            inner,
-            ..
-        } = self;
-        generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
-        Ok(generated_layers)
-    }
-
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
-    #[allow(dead_code)]
-    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
-        Ok((self.generated_layers, self.inner))
-    }
-}
-
-/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
-/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
-/// to be cleaned up).
-#[must_use]
-pub struct SplitDeltaLayerWriter {
-    inner: DeltaLayerWriter,
-    target_layer_size: u64,
-    generated_layers: Vec<ResidentLayer>,
-    conf: &'static PageServerConf,
-    timeline_id: TimelineId,
-    tenant_shard_id: TenantShardId,
-    lsn_range: Range<Lsn>,
-}
-
-impl SplitDeltaLayerWriter {
-    pub async fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_shard_id: TenantShardId,
-        start_key: Key,
-        lsn_range: Range<Lsn>,
-        target_layer_size: u64,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
-            target_layer_size,
-            inner: DeltaLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                start_key,
-                lsn_range.clone(),
-                ctx,
-            )
-            .await?,
-            generated_layers: Vec::new(),
-            conf,
-            timeline_id,
-            tenant_shard_id,
-            lsn_range,
-        })
-    }
-
-    pub async fn put_value(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        val: Value,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
-        // number, and therefore the final layer size could be a little bit larger or smaller than the target.
-        let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
-        {
-            let next_delta_writer = DeltaLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_shard_id,
-                key,
-                self.lsn_range.clone(),
-                ctx,
-            )
-            .await?;
-            let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
-            let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
-            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-            self.generated_layers.push(delta_layer);
-        }
-        self.inner.put_value(key, lsn, val, ctx).await
-    }
-
-    pub(crate) async fn finish(
-        self,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-        end_key: Key,
-    ) -> anyhow::Result<Vec<ResidentLayer>> {
-        let Self {
-            mut generated_layers,
-            inner,
-            ..
-        } = self;
-
-        let (desc, path) = inner.finish(end_key, ctx).await?;
-        let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-        generated_layers.push(delta_layer);
-        Ok(generated_layers)
-    }
-
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
-    #[allow(dead_code)]
-    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
-        Ok((self.generated_layers, self.inner))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::{
-        tenant::{
-            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::AsLayerDesc,
-        },
-        DEFAULT_PG_VERSION,
-    };
-
-    use super::*;
-
-    fn get_key(id: u32) -> Key {
-        let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-        key.field6 = id;
-        key
-    }
-
-    fn get_img(id: u32) -> Bytes {
-        format!("{id:064}").into()
-    }
-
-    fn get_large_img() -> Bytes {
-        vec![0; 8192].into()
-    }
-
-    #[tokio::test]
-    async fn write_one_image() {
-        let harness = TenantHarness::create("split_writer_write_one_image")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut image_writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        let mut delta_writer = SplitDeltaLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18)..Lsn(0x20),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        image_writer
-            .put_image(get_key(0), get_img(0), &tline, &ctx)
-            .await
-            .unwrap();
-        let layers = image_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 1);
-
-        delta_writer
-            .put_value(
-                get_key(0),
-                Lsn(0x18),
-                Value::Image(get_img(0)),
-                &tline,
-                &ctx,
-            )
-            .await
-            .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 1);
-    }
-
-    #[tokio::test]
-    async fn write_split() {
-        let harness = TenantHarness::create("split_writer_write_split")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut image_writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-        let mut delta_writer = SplitDeltaLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18)..Lsn(0x20),
-            4 * 1024 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-        const N: usize = 2000;
-        for i in 0..N {
-            let i = i as u32;
-            image_writer
-                .put_image(get_key(i), get_large_img(), &tline, &ctx)
-                .await
-                .unwrap();
-            delta_writer
-                .put_value(
-                    get_key(i),
-                    Lsn(0x20),
-                    Value::Image(get_large_img()),
-                    &tline,
-                    &ctx,
-                )
-                .await
-                .unwrap();
-        }
-        let image_layers = image_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
-        let delta_layers = delta_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
-        assert_eq!(image_layers.len(), N / 512 + 1);
-        assert_eq!(delta_layers.len(), N / 512 + 1);
-        for idx in 0..image_layers.len() {
-            assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
-            assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
-            assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
-            assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
-            if idx > 0 {
-                assert_eq!(
-                    image_layers[idx - 1].layer_desc().key_range.end,
-                    image_layers[idx].layer_desc().key_range.start
-                );
-                assert_eq!(
-                    delta_layers[idx - 1].layer_desc().key_range.end,
-                    delta_layers[idx].layer_desc().key_range.start
-                );
-            }
-        }
-    }
-
-    #[tokio::test]
-    async fn write_large_img() {
-        let harness = TenantHarness::create("split_writer_write_large_img")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        let mut image_writer = SplitImageLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18),
-            4 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        let mut delta_writer = SplitDeltaLayerWriter::new(
-            tenant.conf,
-            tline.timeline_id,
-            tenant.tenant_shard_id,
-            get_key(0),
-            Lsn(0x18)..Lsn(0x20),
-            4 * 1024,
-            &ctx,
-        )
-        .await
-        .unwrap();
-
-        image_writer
-            .put_image(get_key(0), get_img(0), &tline, &ctx)
-            .await
-            .unwrap();
-        image_writer
-            .put_image(get_key(1), get_large_img(), &tline, &ctx)
-            .await
-            .unwrap();
-        let layers = image_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 2);
-
-        delta_writer
-            .put_value(
-                get_key(0),
-                Lsn(0x18),
-                Value::Image(get_img(0)),
-                &tline,
-                &ctx,
-            )
-            .await
-            .unwrap();
-        delta_writer
-            .put_value(
-                get_key(1),
-                Lsn(0x1A),
-                Value::Image(get_large_img()),
-                &tline,
-                &ctx,
-            )
-            .await
-            .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
-        assert_eq!(layers.len(), 2);
-    }
-}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -211,11 +211,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            } else {
                // Run compaction
                match tenant.compaction_iteration(&cancel, &ctx).await {
-                    Ok(has_pending_task) => {
-                        error_run_count = 0;
-                        // schedule the next compaction immediately in case there is a pending compaction task
-                        if has_pending_task { Duration::ZERO } else { period }
-                    }
                    Err(e) => {
                        let wait_duration = backoff::exponential_backoff_duration_seconds(
                            error_run_count + 1,
@@ -232,6 +227,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                        );
                        wait_duration
                    }
+                    Ok(has_pending_task) => {
+                        error_run_count = 0;
+                        // schedule the next compaction immediately in case there is a pending compaction task
+                        if has_pending_task { Duration::from_secs(0) } else { period }
+                    }
                }
            };

@@ -265,8 +265,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                    count_throttled,
                    sum_throttled_usecs,
                    allowed_rps=%format_args!("{allowed_rps:.0}"),
-                    "shard was throttled in the last n_seconds"
-                );
+                    "shard was throttled in the last n_seconds")
            });

            // Sleep
@@ -366,13 +365,14 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            if first {
                first = false;

-                let delays = async {
-                    delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
-                    random_init_delay(period, &cancel).await?;
-                    Ok::<_, Cancelled>(())
-                };
+                if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel)
+                    .await
+                    .is_err()
+                {
+                    break;
+                }

-                if delays.await.is_err() {
+                if random_init_delay(period, &cancel).await.is_err() {
                    break;
                }
            }
@@ -407,16 +407,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                        error_run_count += 1;
                        let wait_duration = Duration::from_secs_f64(wait_duration);

-                        if matches!(e, crate::tenant::GcError::TimelineCancelled) {
-                            // Timeline was cancelled during gc. We might either be in an event
-                            // that affects the entire tenant (tenant deletion, pageserver shutdown),
-                            // or in one that affects the timeline only (timeline deletion).
-                            // Therefore, don't exit the loop.
-                            info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
-                        } else {
-                            error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
-                        }
-
+                        error!(
+                        "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
+                    );
                        wait_duration
                    }
                }
@@ -424,6 +417,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);

+            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
                .await
                .is_ok()
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -63,19 +63,10 @@ pub(super) async fn delete_local_timeline_directory(
    tenant_shard_id: TenantShardId,
    timeline: &Timeline,
 ) -> anyhow::Result<()> {
-    // Always ensure the lock order is compaction -> gc.
-    let compaction_lock = timeline.compaction_lock.lock();
-    let compaction_lock = crate::timed(
-        compaction_lock,
-        "acquires compaction lock",
-        std::time::Duration::from_secs(5),
-    )
-    .await;
-
-    let gc_lock = timeline.gc_lock.lock();
-    let gc_lock = crate::timed(
-        gc_lock,
-        "acquires gc lock",
+    let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
+    let guards = crate::timed(
+        guards,
+        "acquire gc and compaction locks",
        std::time::Duration::from_secs(5),
    )
    .await;
@@ -116,8 +107,7 @@ pub(super) async fn delete_local_timeline_directory(
        .context("fsync_pre_mark_remove")?;

    info!("finished deleting layer files, releasing locks");
-    drop(gc_lock);
-    drop(compaction_lock);
+    drop(guards);

    fail::fail_point!("timeline-delete-after-rm", |_| {
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
@@ -216,10 +206,11 @@ impl DeleteTimelineFlow {
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip_all)]
+    #[instrument(skip_all, fields(%inplace))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
+        inplace: bool,
    ) -> Result<(), DeleteTimelineError> {
        super::debug_assert_current_span_has_tenant_and_timeline_id();

@@ -230,8 +221,6 @@ impl DeleteTimelineFlow {
        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
        timeline.shutdown(super::ShutdownMode::Hard).await;

-        tenant.gc_block.before_delete(&timeline);
-
        fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
            Err(anyhow::anyhow!(
                "failpoint: timeline-delete-before-index-deleted-at"
@@ -246,7 +235,11 @@ impl DeleteTimelineFlow {
            ))?
        });

-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        if inplace {
+            Self::background(guard, tenant.conf, tenant, &timeline).await?
+        } else {
+            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        }

        Ok(())
    }
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -1,20 +1,16 @@
-use std::{collections::HashSet, sync::Arc};
+use std::sync::Arc;

 use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
 use crate::{
    context::{DownloadBehavior, RequestContext},
    task_mgr::TaskKind,
    tenant::{
-        mgr::GetActiveTenantError,
-        remote_timeline_client::index::GcBlockingReason::DetachAncestor,
        storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer},
        Tenant,
    },
    virtual_file::{MaybeFatalIo, VirtualFile},
 };
-use anyhow::Context;
 use pageserver_api::models::detach_ancestor::AncestorDetached;
-use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
 use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
@@ -42,12 +38,6 @@ pub(crate) enum Error {
    #[error("remote copying layer failed")]
    CopyFailed(#[source] anyhow::Error),

-    #[error("wait for tenant to activate after restarting")]
-    WaitToActivate(#[source] GetActiveTenantError),
-
-    #[error("detached timeline was not found after restart")]
-    DetachedNotFoundAfterRestart,
-
    #[error("unexpected error")]
    Unexpected(#[source] anyhow::Error),

@@ -65,10 +55,6 @@ impl From<Error> for ApiError {
            Error::OtherTimelineDetachOngoing(_) => {
                ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
            }
-            e @ Error::WaitToActivate(_) => {
-                let s = utils::error::report_compact_sources(&e).to_string();
-                ApiError::ResourceUnavailable(s.into())
-            }
            // All of these contain shutdown errors, in fact, it's the most common
            e @ Error::FlushAncestor(_)
            | e @ Error::RewrittenDeltaDownloadFailed(_)
@@ -77,7 +63,6 @@ impl From<Error> for ApiError {
            | e @ Error::CopyFailed(_)
            | e @ Error::Unexpected(_)
            | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
-            Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()),
        }
    }
 }
@@ -89,11 +74,6 @@ impl From<crate::tenant::upload_queue::NotInitialized> for Error {
        Error::ShuttingDown
    }
 }
-impl From<super::layer_manager::Shutdown> for Error {
-    fn from(_: super::layer_manager::Shutdown) -> Self {
-        Error::ShuttingDown
-    }
-}

 impl From<FlushLayerError> for Error {
    fn from(value: FlushLayerError) -> Self {
@@ -111,25 +91,8 @@ impl From<FlushLayerError> for Error {
    }
 }

-impl From<GetActiveTenantError> for Error {
-    fn from(value: GetActiveTenantError) -> Self {
-        use pageserver_api::models::TenantState;
-        use GetActiveTenantError::*;
-
-        match value {
-            Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) | SwitchedTenant => {
-                Error::ShuttingDown
-            }
-            WaitForActiveTimeout { .. } | NotFound(_) | Broken(_) | WillNotBecomeActive(_) => {
-                // NotFound seems out-of-place
-                Error::WaitToActivate(value)
-            }
-        }
-    }
-}
-
 pub(crate) enum Progress {
-    Prepared(Attempt, PreparedTimelineDetach),
+    Prepared(completion::Completion, PreparedTimelineDetach),
    Done(AncestorDetached),
 }

@@ -153,26 +116,6 @@ impl Default for Options {
    }
 }

-/// Represents an across tenant reset exclusive single attempt to detach ancestor.
-#[derive(Debug)]
-pub(crate) struct Attempt {
-    pub(crate) timeline_id: TimelineId,
-
-    _guard: completion::Completion,
-    gate_entered: Option<utils::sync::gate::GateGuard>,
-}
-
-impl Attempt {
-    pub(crate) fn before_reset_tenant(&mut self) {
-        let taken = self.gate_entered.take();
-        assert!(taken.is_some());
-    }
-
-    pub(crate) fn new_barrier(&self) -> completion::Barrier {
-        self._guard.barrier()
-    }
-}
-
 /// See [`Timeline::prepare_to_detach_from_ancestor`]
 pub(super) async fn prepare(
    detached: &Arc<Timeline>,
@@ -187,38 +130,61 @@ pub(super) async fn prepare(
        .as_ref()
        .map(|tl| (tl.clone(), detached.ancestor_lsn))
    else {
-        let still_in_progress = {
+        {
            let accessor = detached.remote_client.initialized_upload_queue()?;

            // we are safe to inspect the latest uploaded, because we can only witness this after
            // restart is complete and ancestor is no more.
            let latest = accessor.latest_uploaded_index_part();
-            if latest.lineage.detached_previous_ancestor().is_none() {
+            if !latest.lineage.is_detached_from_original_ancestor() {
                return Err(NoAncestor);
-            };
-
-            latest
-                .gc_blocking
-                .as_ref()
-                .is_some_and(|b| b.blocked_by(DetachAncestor))
-        };
-
-        if still_in_progress {
-            // gc is still blocked, we can still reparent and complete.
-            // we are safe to reparent remaining, because they were locked in in the beginning.
-            let attempt = continue_with_blocked_gc(detached, tenant).await?;
-
-            // because the ancestor of detached is already set to none, we have published all
-            // of the layers, so we are still "prepared."
-            return Ok(Progress::Prepared(
-                attempt,
-                PreparedTimelineDetach { layers: Vec::new() },
-            ));
+            }
        }

-        let reparented_timelines = reparented_direct_children(detached, tenant)?;
+        // detached has previously been detached; let's inspect each of the current timelines and
+        // report back the timelines which have been reparented by our detach
+        let mut all_direct_children = tenant
+            .timelines
+            .lock()
+            .unwrap()
+            .values()
+            .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
+            .map(|tl| (tl.ancestor_lsn, tl.clone()))
+            .collect::<Vec<_>>();
+
+        let mut any_shutdown = false;
+
+        all_direct_children.retain(
+            |(_, tl)| match tl.remote_client.initialized_upload_queue() {
+                Ok(accessor) => accessor
+                    .latest_uploaded_index_part()
+                    .lineage
+                    .is_reparented(),
+                Err(_shutdownalike) => {
+                    // not 100% a shutdown, but let's bail early not to give inconsistent results in
+                    // sharded environment.
+                    any_shutdown = true;
+                    true
+                }
+            },
+        );
+
+        if any_shutdown {
+            // it could be one or many being deleted; have client retry
+            return Err(Error::ShuttingDown);
+        }
+
+        let mut reparented = all_direct_children;
+        // why this instead of hashset? there is a reason, but I've forgotten it many times.
+        //
+        // maybe if this was a hashset we would not be able to distinguish some race condition.
+        reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
+
        return Ok(Progress::Done(AncestorDetached {
-            reparented_timelines,
+            reparented_timelines: reparented
+                .into_iter()
+                .map(|(_, tl)| tl.timeline_id)
+                .collect(),
        }));
    };

@@ -234,7 +200,22 @@ pub(super) async fn prepare(
        return Err(TooManyAncestors);
    }

-    let attempt = start_new_attempt(detached, tenant).await?;
+    // before we acquire the gate, we must mark the ancestor as having a detach operation
+    // ongoing which will block other concurrent detach operations so we don't get into awkward
+    // situations where there would be two branches trying to reparent earlier branches.
+    let (guard, barrier) = completion::channel();
+
+    {
+        let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
+        if let Some((tl, other)) = guard.as_ref() {
+            if !other.is_ready() {
+                return Err(OtherTimelineDetachOngoing(*tl));
+            }
+        }
+        *guard = Some((detached.timeline_id, barrier));
+    }
+
+    let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;

    utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");

@@ -296,12 +277,11 @@ pub(super) async fn prepare(

        // between retries, these can change if compaction or gc ran in between. this will mean
        // we have to redo work.
-        partition_work(ancestor_lsn, &layers)?
+        partition_work(ancestor_lsn, &layers)
    };

    // TODO: layers are already sorted by something: use that to determine how much of remote
-    // copies are already done -- gc is blocked, but a compaction could had happened on ancestor,
-    // which is something to keep in mind if copy skipping is implemented.
+    // copies are already done.
    tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers");

    // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
@@ -315,33 +295,29 @@ pub(super) async fn prepare(

        let mut wrote_any = false;

-        let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get()));
+        let limiter = Arc::new(tokio::sync::Semaphore::new(
+            options.rewrite_concurrency.get(),
+        ));

        for layer in straddling_branchpoint {
            let limiter = limiter.clone();
            let timeline = detached.clone();
            let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download);

-            let span = tracing::info_span!("upload_rewritten_layer", %layer);
-            tasks.spawn(
-                async move {
-                    let _permit = limiter.acquire().await;
-                    let copied =
-                        upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
-                            .await?;
-                    if let Some(copied) = copied.as_ref() {
-                        tracing::info!(%copied, "rewrote and uploaded");
-                    }
-                    Ok(copied)
-                }
-                .instrument(span),
-            );
+            tasks.spawn(async move {
+                let _permit = limiter.acquire().await;
+                let copied =
+                    upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
+                        .await?;
+                Ok(copied)
+            });
        }

        while let Some(res) = tasks.join_next().await {
            match res {
                Ok(Ok(Some(copied))) => {
                    wrote_any = true;
+                    tracing::info!(layer=%copied, "rewrote and uploaded");
                    new_layers.push(copied);
                }
                Ok(Ok(None)) => {}
@@ -368,7 +344,7 @@ pub(super) async fn prepare(
    }

    let mut tasks = tokio::task::JoinSet::new();
-    let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get()));
+    let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get()));

    for adopted in rest_of_historic {
        let limiter = limiter.clone();
@@ -402,119 +378,19 @@ pub(super) async fn prepare(

    let prepared = PreparedTimelineDetach { layers: new_layers };

-    Ok(Progress::Prepared(attempt, prepared))
-}
-
-async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
-    let attempt = obtain_exclusive_attempt(detached, tenant)?;
-
-    // insert the block in the index_part.json, if not already there.
-    let _dont_care = tenant
-        .gc_block
-        .insert(
-            detached,
-            crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor,
-        )
-        .await
-        // FIXME: better error
-        .map_err(Error::Unexpected)?;
-
-    Ok(attempt)
-}
-
-async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
-    // FIXME: it would be nice to confirm that there is an in-memory version, since we've just
-    // verified there is a persistent one?
-    obtain_exclusive_attempt(detached, tenant)
-}
-
-fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
-    use Error::{OtherTimelineDetachOngoing, ShuttingDown};
-
-    // ensure we are the only active attempt for this tenant
-    let (guard, barrier) = completion::channel();
-    {
-        let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
-        if let Some((tl, other)) = guard.as_ref() {
-            if !other.is_ready() {
-                return Err(OtherTimelineDetachOngoing(*tl));
-            }
-            // FIXME: no test enters here
-        }
-        *guard = Some((detached.timeline_id, barrier));
-    }
-
-    // ensure the gate is still open
-    let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
-
-    Ok(Attempt {
-        timeline_id: detached.timeline_id,
-        _guard: guard,
-        gate_entered: Some(_gate_entered),
-    })
-}
-
-fn reparented_direct_children(
-    detached: &Arc<Timeline>,
-    tenant: &Tenant,
-) -> Result<HashSet<TimelineId>, Error> {
-    let mut all_direct_children = tenant
-        .timelines
-        .lock()
-        .unwrap()
-        .values()
-        .filter_map(|tl| {
-            let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached));
-
-            if is_direct_child {
-                Some(tl.clone())
-            } else {
-                if let Some(timeline) = tl.ancestor_timeline.as_ref() {
-                    assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live");
-                }
-                None
-            }
-        })
-        // Collect to avoid lock taking order problem with Tenant::timelines and
-        // Timeline::remote_client
-        .collect::<Vec<_>>();
-
-    let mut any_shutdown = false;
-
-    all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() {
-        Ok(accessor) => accessor
-            .latest_uploaded_index_part()
-            .lineage
-            .is_reparented(),
-        Err(_shutdownalike) => {
-            // not 100% a shutdown, but let's bail early not to give inconsistent results in
-            // sharded enviroment.
-            any_shutdown = true;
-            true
-        }
-    });
-
-    if any_shutdown {
-        // it could be one or many being deleted; have client retry
-        return Err(Error::ShuttingDown);
-    }
-
-    Ok(all_direct_children
-        .into_iter()
-        .map(|tl| tl.timeline_id)
-        .collect())
+    Ok(Progress::Prepared(guard, prepared))
 }

 fn partition_work(
    ancestor_lsn: Lsn,
-    source: &LayerManager,
-) -> Result<(usize, Vec<Layer>, Vec<Layer>), Error> {
+    source_layermap: &LayerManager,
+) -> (usize, Vec<Layer>, Vec<Layer>) {
    let mut straddling_branchpoint = vec![];
    let mut rest_of_historic = vec![];

    let mut later_by_lsn = 0;

-    for desc in source.layer_map()?.iter_historic_layers() {
+    for desc in source_layermap.layer_map().iter_historic_layers() {
        // off by one chances here:
        // - start is inclusive
        // - end is exclusive
@@ -533,10 +409,10 @@ fn partition_work(
            &mut rest_of_historic
        };

-        target.push(source.get_from_desc(&desc));
+        target.push(source_layermap.get_from_desc(&desc));
    }

-    Ok((later_by_lsn, straddling_branchpoint, rest_of_historic))
+    (later_by_lsn, straddling_branchpoint, rest_of_historic)
 }

 async fn upload_rewritten_layer(
@@ -612,12 +488,10 @@ async fn copy_lsn_prefix(
        // reuse the key instead of adding more holes between layers by using the real
        // highest key in the layer.
        let reused_highest_key = layer.layer_desc().key_range.end;
-        let (desc, path) = writer
-            .finish(reused_highest_key, ctx)
+        let copied = writer
+            .finish(reused_highest_key, target_timeline, ctx)
            .await
            .map_err(CopyDeltaPrefix)?;
-        let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
-            .map_err(CopyDeltaPrefix)?;

        tracing::debug!(%layer, %copied, "new layer produced");

@@ -657,311 +531,131 @@ async fn remote_copy(
        .map_err(CopyFailed)
 }

-pub(crate) enum DetachingAndReparenting {
-    /// All of the following timeline ids were reparented and the timeline ancestor detach must be
-    /// marked as completed.
-    Reparented(HashSet<TimelineId>),
-
-    /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as
-    /// completed.
-    ///
-    /// Nested `must_reset_tenant` is set to true when any restart requiring changes were made.
-    SomeReparentingFailed { must_reset_tenant: bool },
-
-    /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach
-    /// must be marked as completed.
-    AlreadyDone(HashSet<TimelineId>),
-}
-
-impl DetachingAndReparenting {
-    pub(crate) fn reset_tenant_required(&self) -> bool {
-        use DetachingAndReparenting::*;
-        match self {
-            Reparented(_) => true,
-            SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant,
-            AlreadyDone(_) => false,
-        }
-    }
-
-    pub(crate) fn completed(self) -> Option<HashSet<TimelineId>> {
-        use DetachingAndReparenting::*;
-        match self {
-            Reparented(x) | AlreadyDone(x) => Some(x),
-            SomeReparentingFailed { .. } => None,
-        }
-    }
-}
-
-/// See [`Timeline::detach_from_ancestor_and_reparent`].
-pub(super) async fn detach_and_reparent(
+/// See [`Timeline::complete_detaching_timeline_ancestor`].
+pub(super) async fn complete(
    detached: &Arc<Timeline>,
    tenant: &Tenant,
    prepared: PreparedTimelineDetach,
    _ctx: &RequestContext,
-) -> Result<DetachingAndReparenting, anyhow::Error> {
+) -> Result<Vec<TimelineId>, anyhow::Error> {
    let PreparedTimelineDetach { layers } = prepared;

-    #[derive(Debug)]
-    enum Ancestor {
-        NotDetached(Arc<Timeline>, Lsn),
-        Detached(Arc<Timeline>, Lsn),
-    }
-
-    let (recorded_branchpoint, still_ongoing) = {
-        let access = detached.remote_client.initialized_upload_queue()?;
-        let latest = access.latest_uploaded_index_part();
-
-        (
-            latest.lineage.detached_previous_ancestor(),
-            latest
-                .gc_blocking
-                .as_ref()
-                .is_some_and(|b| b.blocked_by(DetachAncestor)),
-        )
-    };
-    assert!(
-        still_ongoing,
-        "cannot (detach? reparent)? complete if the operation is not still ongoing"
-    );
-
-    let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) {
-        (Some(ancestor), None) => {
-            assert!(
-                !layers.is_empty(),
-                "there should always be at least one layer to inherit"
-            );
-            Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn)
-        }
-        (Some(_), Some(_)) => {
-            panic!(
-                "it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None"
-            );
-        }
-        (None, Some((ancestor_id, ancestor_lsn))) => {
-            // it has been either:
-            // - detached but still exists => we can try reparenting
-            // - detached and deleted
-            //
-            // either way, we must complete
-            assert!(
-                layers.is_empty(),
-                "no layers should had been copied as detach is done"
-            );
-
-            let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned();
-
-            if let Some(ancestor) = existing {
-                Ancestor::Detached(ancestor, ancestor_lsn)
-            } else {
-                let direct_children = reparented_direct_children(detached, tenant)?;
-                return Ok(DetachingAndReparenting::AlreadyDone(direct_children));
-            }
-        }
-        (None, None) => {
-            // TODO: make sure there are no `?` before tenant_reset from after a questionmark from
-            // here.
-            panic!(
-            "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor"
-            );
-        }
-    };
+    let ancestor = detached
+        .get_ancestor_timeline()
+        .expect("must still have a ancestor");
+    let ancestor_lsn = detached.get_ancestor_lsn();

    // publish the prepared layers before we reparent any of the timelines, so that on restart
    // reparented timelines find layers. also do the actual detaching.
    //
-    // if we crash after this operation, a retry will allow reparenting the remaining timelines as
-    // gc is blocked.
-
-    let (ancestor, ancestor_lsn, was_detached) = match ancestor {
-        Ancestor::NotDetached(ancestor, ancestor_lsn) => {
-            // this has to complete before any reparentings because otherwise they would not have
-            // layers on the new parent.
-            detached
-                .remote_client
-                .schedule_adding_existing_layers_to_index_detach_and_wait(
-                    &layers,
-                    (ancestor.timeline_id, ancestor_lsn),
-                )
-                .await
-                .context("publish layers and detach ancestor")?;
-
-            tracing::info!(
-                ancestor=%ancestor.timeline_id,
-                %ancestor_lsn,
-                inherited_layers=%layers.len(),
-                "detached from ancestor"
-            );
-            (ancestor, ancestor_lsn, true)
-        }
-        Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false),
-    };
+    // if we crash after this operation, we will at least come up having detached a timeline, but
+    // we cannot go back and reparent the timelines which would have been reparented in normal
+    // execution.
+    //
+    // this is not perfect, but it avoids a retry happening after a compaction or gc on restart
+    // which could give us a completely wrong layer combination.
+    detached
+        .remote_client
+        .schedule_adding_existing_layers_to_index_detach_and_wait(
+            &layers,
+            (ancestor.timeline_id, ancestor_lsn),
+        )
+        .await?;

    let mut tasks = tokio::task::JoinSet::new();

-    // Returns a single permit semaphore which will be used to make one reparenting succeed,
-    // others will fail as if those timelines had been stopped for whatever reason.
-    #[cfg(feature = "testing")]
-    let failpoint_sem = || -> Option<Arc<Semaphore>> {
-        fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some(
-            Arc::new(Semaphore::new(1))
-        ));
-        None
-    }();
-
    // because we are now keeping the slot in progress, it is unlikely that there will be any
    // timeline deletions during this time. if we raced one, then we'll just ignore it.
-    {
-        let g = tenant.timelines.lock().unwrap();
-        reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn)
-            .cloned()
-            .for_each(|timeline| {
-                // important in this scope: we are holding the Tenant::timelines lock
-                let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
-                let new_parent = detached.timeline_id;
-                #[cfg(feature = "testing")]
-                let failpoint_sem = failpoint_sem.clone();
+    tenant
+        .timelines
+        .lock()
+        .unwrap()
+        .values()
+        .filter_map(|tl| {
+            if Arc::ptr_eq(tl, detached) {
+                return None;
+            }

-                tasks.spawn(
-                    async move {
-                        let res = async {
-                            #[cfg(feature = "testing")]
-                            if let Some(failpoint_sem) = failpoint_sem {
-                                let _permit = failpoint_sem.acquire().await.map_err(|_| {
-                                    anyhow::anyhow!(
-                                        "failpoint: timeline-detach-ancestor::allow_one_reparented",
-                                    )
-                                })?;
-                                failpoint_sem.close();
-                            }
+            if !tl.is_active() {
+                return None;
+            }

-                            timeline
-                                .remote_client
-                                .schedule_reparenting_and_wait(&new_parent)
-                                .await
-                        }
+            let tl_ancestor = tl.ancestor_timeline.as_ref()?;
+            let is_same = Arc::ptr_eq(&ancestor, tl_ancestor);
+            let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
+
+            let is_deleting = tl
+                .delete_progress
+                .try_lock()
+                .map(|flow| !flow.is_not_started())
+                .unwrap_or(true);
+
+            if is_same && is_earlier && !is_deleting {
+                Some(tl.clone())
+            } else {
+                None
+            }
+        })
+        .for_each(|timeline| {
+            // important in this scope: we are holding the Tenant::timelines lock
+            let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
+            let new_parent = detached.timeline_id;
+
+            tasks.spawn(
+                async move {
+                    let res = timeline
+                        .remote_client
+                        .schedule_reparenting_and_wait(&new_parent)
                        .await;

-                        match res {
-                            Ok(()) => {
-                                tracing::info!("reparented");
-                                Some(timeline)
-                            }
-                            Err(e) => {
-                                // with the use of tenant slot, raced timeline deletion is the most
-                                // likely reason.
-                                tracing::warn!("reparenting failed: {e:#}");
-                                None
-                            }
+                    match res {
+                        Ok(()) => Some(timeline),
+                        Err(e) => {
+                            // with the use of tenant slot, we no longer expect these.
+                            tracing::warn!("reparenting failed: {e:#}");
+                            None
                        }
                    }
-                    .instrument(span),
-                );
-            });
-    }
+                }
+                .instrument(span),
+            );
+        });

    let reparenting_candidates = tasks.len();
-    let mut reparented = HashSet::with_capacity(tasks.len());
+    let mut reparented = Vec::with_capacity(tasks.len());

    while let Some(res) = tasks.join_next().await {
        match res {
            Ok(Some(timeline)) => {
-                assert!(
-                    reparented.insert(timeline.timeline_id),
-                    "duplicate reparenting? timeline_id={}",
-                    timeline.timeline_id
-                );
+                tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
+                reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
+            }
+            Ok(None) => {
+                // let's just ignore this for now. one or all reparented timelines could have
+                // started deletion, and that is fine.
            }
            Err(je) if je.is_cancelled() => unreachable!("not used"),
-            // just ignore failures now, we can retry
-            Ok(None) => {}
-            Err(je) if je.is_panic() => {}
+            Err(je) if je.is_panic() => {
+                // ignore; it's better to continue with a single reparenting failing (or even
+                // all of them) in order to get to the goal state.
+                //
+                // these timelines will never be reparentable, but they can always be detached as
+                // separate tree roots.
+            }
            Err(je) => tracing::error!("unexpected join error: {je:?}"),
        }
    }

-    let reparented_all = reparenting_candidates == reparented.len();
-
-    if reparented_all {
-        Ok(DetachingAndReparenting::Reparented(reparented))
-    } else {
-        tracing::info!(
-            reparented = reparented.len(),
-            candidates = reparenting_candidates,
-            "failed to reparent all candidates; they can be retried after the tenant_reset",
-        );
-
-        let must_reset_tenant = !reparented.is_empty() || was_detached;
-        Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant })
-    }
-}
-
-pub(super) async fn complete(
-    detached: &Arc<Timeline>,
-    tenant: &Tenant,
-    mut attempt: Attempt,
-    _ctx: &RequestContext,
-) -> Result<(), Error> {
-    assert_eq!(detached.timeline_id, attempt.timeline_id);
-
-    if attempt.gate_entered.is_none() {
-        let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?;
-        attempt.gate_entered = Some(entered);
-    } else {
-        // Some(gate_entered) means the tenant was not restarted, as is not required
+    if reparenting_candidates != reparented.len() {
+        tracing::info!("failed to reparent some candidates");
    }

-    assert!(detached.ancestor_timeline.is_none());
+    reparented.sort_unstable();

-    // this should be an 503 at least...?
-    fail::fail_point!(
-        "timeline-detach-ancestor::complete_before_uploading",
-        |_| Err(Error::Failpoint(
-            "timeline-detach-ancestor::complete_before_uploading"
-        ))
-    );
+    let reparented = reparented
+        .into_iter()
+        .map(|(_, timeline_id)| timeline_id)
+        .collect();

-    tenant
-        .gc_block
-        .remove(
-            detached,
-            crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor,
-        )
-        .await
-        // FIXME: better error
-        .map_err(Error::Unexpected)?;
-
-    Ok(())
-}
-
-/// Query against a locked `Tenant::timelines`.
-fn reparentable_timelines<'a, I>(
-    timelines: I,
-    detached: &'a Arc<Timeline>,
-    ancestor: &'a Arc<Timeline>,
-    ancestor_lsn: Lsn,
-) -> impl Iterator<Item = &'a Arc<Timeline>> + 'a
-where
-    I: Iterator<Item = &'a Arc<Timeline>> + 'a,
-{
-    timelines.filter_map(move |tl| {
-        if Arc::ptr_eq(tl, detached) {
-            return None;
-        }
-
-        let tl_ancestor = tl.ancestor_timeline.as_ref()?;
-        let is_same = Arc::ptr_eq(ancestor, tl_ancestor);
-        let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
-
-        let is_deleting = tl
-            .delete_progress
-            .try_lock()
-            .map(|flow| !flow.is_not_started())
-            .unwrap_or(true);
-
-        if is_same && is_earlier && !is_deleting {
-            Some(tl)
-        } else {
-            None
-        }
-    })
+    Ok(reparented)
 }
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -213,45 +213,51 @@ impl Timeline {
        let mut js = tokio::task::JoinSet::new();
        {
            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            for layer in layers.iter_historic_layers() {
+                let layer = guard.get_from_desc(&layer);

-            guard
-                .likely_resident_layers()
-                .filter(|layer| {
-                    let last_activity_ts = layer.latest_activity();
+                // Guard against eviction while we inspect it. It might be that eviction_task and
+                // disk_usage_eviction_task both select the same layers to be evicted, and
+                // seemingly free up double the space; both succeeding is of no consequence.

-                    let no_activity_for = match now.duration_since(last_activity_ts) {
-                        Ok(d) => d,
-                        Err(_e) => {
-                            // We reach here if `now` < `last_activity_ts`, which can legitimately
-                            // happen if there is an access between us getting `now`, and us getting
-                            // the access stats from the layer.
-                            //
-                            // The other reason why it can happen is system clock skew because
-                            // SystemTime::now() is not monotonic, so, even if there is no access
-                            // to the layer after we get `now` at the beginning of this function,
-                            // it could be that `now`  < `last_activity_ts`.
-                            //
-                            // To distinguish the cases, we would need to record `Instant`s in the
-                            // access stats (i.e., monotonic timestamps), but then, the timestamps
-                            // values in the access stats would need to be `Instant`'s, and hence
-                            // they would be meaningless outside of the pageserver process.
-                            // At the time of writing, the trade-off is that access stats are more
-                            // valuable than detecting clock skew.
-                            return false;
-                        }
-                    };
+                if !layer.is_likely_resident() {
+                    continue;
+                }

-                    no_activity_for > p.threshold
-                })
-                .cloned()
-                .for_each(|layer| {
+                let last_activity_ts = layer.access_stats().latest_activity();
+
+                let no_activity_for = match now.duration_since(last_activity_ts) {
+                    Ok(d) => d,
+                    Err(_e) => {
+                        // We reach here if `now` < `last_activity_ts`, which can legitimately
+                        // happen if there is an access between us getting `now`, and us getting
+                        // the access stats from the layer.
+                        //
+                        // The other reason why it can happen is system clock skew because
+                        // SystemTime::now() is not monotonic, so, even if there is no access
+                        // to the layer after we get `now` at the beginning of this function,
+                        // it could be that `now` < `last_activity_ts`.
+                        //
+                        // To distinguish the cases, we would need to record `Instant`s in the
+                        // access stats (i.e., monotonic timestamps), but then, the timestamps
+                        // values in the access stats would need to be `Instant`'s, and hence
+                        // they would be meaningless outside of the pageserver process.
+                        // At the time of writing, the trade-off is that access stats are more
+                        // valuable than detecting clock skew.
+                        continue;
+                    }
+                };
+
+                if no_activity_for > p.threshold {
                    js.spawn(async move {
                        layer
                            .evict_and_wait(std::time::Duration::from_secs(5))
                            .await
                    });
                    stats.candidates += 1;
-                });
+                }
+            }
        };

        let join_all = async move {
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1,967 +0,0 @@
-//! An efficient way to keep the timeline gate open without preventing
-//! timeline shutdown for longer than a single call to a timeline method.
-//!
-//! # Motivation
-//!
-//! On a single page service connection, we're typically serving a single TenantTimelineId.
-//!
-//! Without sharding, there is a single Timeline object to which we dispatch
-//! all requests. For example, a getpage request gets dispatched to the
-//! Timeline::get method of the Timeline object that represents the
-//! (tenant,timeline) of that connection.
-//!
-//! With sharding, for each request that comes in on the connection,
-//! we first have to perform shard routing based on the requested key (=~ page number).
-//! The result of shard routing is a Timeline object.
-//! We then dispatch the request to that Timeline object.
-//!
-//! Regardless of whether the tenant is sharded or not, we want to ensure that
-//! we hold the Timeline gate open while we're invoking the method on the
-//! Timeline object.
-//!
-//! However, we want to avoid the overhead of entering the gate for every
-//! method invocation.
-//!
-//! Further, for shard routing, we want to avoid calling the tenant manager to
-//! resolve the shard for every request. Instead, we want to cache the
-//! routing result so we can bypass the tenant manager for all subsequent requests
-//! that get routed to that shard.
-//!
-//! Regardless of how we accomplish the above, it should not
-//! prevent the Timeline from shutting down promptly.
-//!
-//! # Design
-//!
-//! There are three user-facing data structures:
-//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
-//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
-//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
-//!   Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
-//!
-//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
-//!
-//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
-//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
-//!
-//! To dispatch a request, the page service connection calls `Cache::get`.
-//!
-//! A cache miss means we consult the tenant manager for shard routing,
-//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
-//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
-//! and the `Arc<HandleInner>` in the `PerTimelineState`.
-//!
-//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
-//! and find the `Weak<HandleInner>` in the cache.
-//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
-//!
-//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
-//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
-//!
-//! # Memory Management / How The Reference Cycle Is Broken
-//!
-//! The attentive reader may have noticed the strong reference cycle
-//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
-//!
-//! This cycle is intentional: while it exists, the `Cache` can upgrade its
-//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
-//!
-//! The cycle is broken by either
-//! - `PerTimelineState::shutdown` or
-//! - dropping the `Cache`.
-//!
-//! Concurrently existing `Handle`s will extend the existence of the cycle.
-//! However, since `Handle`s are short-lived and new `Handle`s are not
-//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
-//! that extension of the cycle is bounded.
-//!
-//! # Fast Path for Shard Routing
-//!
-//! The `Cache` has a fast path for shard routing to avoid calling into
-//! the tenant manager for every request.
-//!
-//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
-//!
-//! The current implementation uses the first entry in the hash map
-//! to determine the `ShardParameters` and derive the correct
-//! `ShardIndex` for the requested key.
-//!
-//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
-//!
-//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
-//! it's a hit.
-//!
-//! ## Cache invalidation
-//!
-//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
-//! The only reasons why an entry in the cache can become stale are:
-//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
-//!    being detached, timeline or shard deleted, or pageserver is shutting down.
-//! 2. We're doing a shard split and new traffic should be routed to the child shards.
-//!
-//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
-//! timeline has shut down, and when that happens, we remove the entry from the cache.
-//!
-//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
-//! to the parent shard during a shard split. Eventually, the shard split task will
-//! shut down the parent => case (1).
-
-use std::collections::hash_map;
-use std::collections::HashMap;
-use std::sync::atomic::AtomicBool;
-use std::sync::atomic::Ordering;
-use std::sync::Arc;
-use std::sync::Mutex;
-use std::sync::Weak;
-
-use pageserver_api::shard::ShardIdentity;
-use tracing::instrument;
-use tracing::trace;
-use utils::id::TimelineId;
-use utils::shard::ShardIndex;
-use utils::shard::ShardNumber;
-
-use crate::tenant::mgr::ShardSelector;
-
-/// The requirement for Debug is so that #[derive(Debug)] works in some places.
-pub(crate) trait Types: Sized + std::fmt::Debug {
-    type TenantManagerError: Sized + std::fmt::Debug;
-    type TenantManager: TenantManager<Self> + Sized;
-    type Timeline: ArcTimeline<Self> + Sized;
-}
-
-/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
-/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
-/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
-#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
-struct CacheId(u64);
-
-impl CacheId {
-    fn next() -> Self {
-        static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
-        let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        if id == 0 {
-            panic!("CacheId::new() returned 0, overflow");
-        }
-        Self(id)
-    }
-}
-
-/// See module-level comment.
-pub(crate) struct Cache<T: Types> {
-    id: CacheId,
-    map: Map<T>,
-}
-
-type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
-
-impl<T: Types> Default for Cache<T> {
-    fn default() -> Self {
-        Self {
-            id: CacheId::next(),
-            map: Default::default(),
-        }
-    }
-}
-
-#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
-pub(crate) struct ShardTimelineId {
-    pub(crate) shard_index: ShardIndex,
-    pub(crate) timeline_id: TimelineId,
-}
-
-/// See module-level comment.
-pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
-struct HandleInner<T: Types> {
-    shut_down: AtomicBool,
-    timeline: T::Timeline,
-    // The timeline's gate held open.
-    _gate_guard: utils::sync::gate::GateGuard,
-}
-
-/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
-///
-/// See module-level comment for details.
-pub struct PerTimelineState<T: Types> {
-    // None = shutting down
-    handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
-}
-
-impl<T: Types> Default for PerTimelineState<T> {
-    fn default() -> Self {
-        Self {
-            handles: Mutex::new(Some(Default::default())),
-        }
-    }
-}
-
-/// Abstract view of [`crate::tenant::mgr`], for testability.
-pub(crate) trait TenantManager<T: Types> {
-    /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
-    /// Errors are returned as [`GetError::TenantManager`].
-    async fn resolve(
-        &self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-    ) -> Result<T::Timeline, T::TenantManagerError>;
-}
-
-/// Abstract view of an [`Arc<Timeline>`], for testability.
-pub(crate) trait ArcTimeline<T: Types>: Clone {
-    fn gate(&self) -> &utils::sync::gate::Gate;
-    fn shard_timeline_id(&self) -> ShardTimelineId;
-    fn get_shard_identity(&self) -> &ShardIdentity;
-    fn per_timeline_state(&self) -> &PerTimelineState<T>;
-}
-
-/// Errors returned by [`Cache::get`].
-#[derive(Debug)]
-pub(crate) enum GetError<T: Types> {
-    TenantManager(T::TenantManagerError),
-    TimelineGateClosed,
-    PerTimelineStateShutDown,
-}
-
-/// Internal type used in [`Cache::get`].
-enum RoutingResult<T: Types> {
-    FastPath(Handle<T>),
-    SlowPath(ShardTimelineId),
-    NeedConsultTenantManager,
-}
-
-impl<T: Types> Cache<T> {
-    /// See module-level comment for details.
-    ///
-    /// Does NOT check for the shutdown state of [`Types::Timeline`].
-    /// Instead, the methods of [`Types::Timeline`] that are invoked through
-    /// the [`Handle`] are responsible for checking these conditions
-    /// and if so, return an error that causes the page service to
-    /// close the connection.
-    #[instrument(level = "trace", skip_all)]
-    pub(crate) async fn get(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        // terminates because each iteration removes an element from the map
-        loop {
-            let handle = self
-                .get_impl(timeline_id, shard_selector, tenant_manager)
-                .await?;
-            if handle.0.shut_down.load(Ordering::Relaxed) {
-                let removed = self
-                    .map
-                    .remove(&handle.0.timeline.shard_timeline_id())
-                    .expect("invariant of get_impl is that the returned handle is in the map");
-                assert!(
-                    Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
-                    "shard_timeline_id() incorrect?"
-                );
-            } else {
-                return Ok(handle);
-            }
-        }
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    async fn get_impl(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        let miss: ShardSelector = {
-            let routing_state = self.shard_routing(timeline_id, shard_selector);
-            match routing_state {
-                RoutingResult::FastPath(handle) => return Ok(handle),
-                RoutingResult::SlowPath(key) => match self.map.get(&key) {
-                    Some(cached) => match cached.upgrade() {
-                        Some(upgraded) => return Ok(Handle(upgraded)),
-                        None => {
-                            trace!("handle cache stale");
-                            self.map.remove(&key).unwrap();
-                            ShardSelector::Known(key.shard_index)
-                        }
-                    },
-                    None => ShardSelector::Known(key.shard_index),
-                },
-                RoutingResult::NeedConsultTenantManager => shard_selector,
-            }
-        };
-        self.get_miss(timeline_id, miss, tenant_manager).await
-    }
-
-    #[inline(always)]
-    fn shard_routing(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-    ) -> RoutingResult<T> {
-        loop {
-            // terminates because when every iteration we remove an element from the map
-            let Some((first_key, first_handle)) = self.map.iter().next() else {
-                return RoutingResult::NeedConsultTenantManager;
-            };
-            let Some(first_handle) = first_handle.upgrade() else {
-                // TODO: dedup with get()
-                trace!("handle cache stale");
-                let first_key_owned = *first_key;
-                self.map.remove(&first_key_owned).unwrap();
-                continue;
-            };
-
-            let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
-            let make_shard_index = |shard_num: ShardNumber| ShardIndex {
-                shard_number: shard_num,
-                shard_count: first_handle_shard_identity.count,
-            };
-
-            let need_idx = match shard_selector {
-                ShardSelector::Page(key) => {
-                    make_shard_index(first_handle_shard_identity.get_shard_number(&key))
-                }
-                ShardSelector::Zero => make_shard_index(ShardNumber(0)),
-                ShardSelector::Known(shard_idx) => shard_idx,
-            };
-            let need_shard_timeline_id = ShardTimelineId {
-                shard_index: need_idx,
-                timeline_id,
-            };
-            let first_handle_shard_timeline_id = ShardTimelineId {
-                shard_index: first_handle_shard_identity.shard_index(),
-                timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
-            };
-
-            if need_shard_timeline_id == first_handle_shard_timeline_id {
-                return RoutingResult::FastPath(Handle(first_handle));
-            } else {
-                return RoutingResult::SlowPath(need_shard_timeline_id);
-            }
-        }
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    #[inline(always)]
-    async fn get_miss(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        match tenant_manager.resolve(timeline_id, shard_selector).await {
-            Ok(timeline) => {
-                let key = timeline.shard_timeline_id();
-                match &shard_selector {
-                    ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
-                    ShardSelector::Page(_) => (), // gotta trust tenant_manager
-                    ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
-                }
-
-                let gate_guard = match timeline.gate().enter() {
-                    Ok(guard) => guard,
-                    Err(_) => {
-                        return Err(GetError::TimelineGateClosed);
-                    }
-                };
-                trace!("creating new HandleInner");
-                let handle = Arc::new(
-                    // TODO: global metric that keeps track of the number of live HandlerTimeline instances
-                    // so we can identify reference cycle bugs.
-                    HandleInner {
-                        shut_down: AtomicBool::new(false),
-                        _gate_guard: gate_guard,
-                        timeline: timeline.clone(),
-                    },
-                );
-                let handle = {
-                    let mut lock_guard = timeline
-                        .per_timeline_state()
-                        .handles
-                        .lock()
-                        .expect("mutex poisoned");
-                    match &mut *lock_guard {
-                        Some(per_timeline_state) => {
-                            let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
-                            assert!(replaced.is_none(), "some earlier code left a stale handle");
-                            match self.map.entry(key) {
-                                hash_map::Entry::Occupied(_o) => {
-                                    // This cannot not happen because
-                                    // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
-                                    // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
-                                    //    while we were waiting for the tenant manager.
-                                    unreachable!()
-                                }
-                                hash_map::Entry::Vacant(v) => {
-                                    v.insert(Arc::downgrade(&handle));
-                                    handle
-                                }
-                            }
-                        }
-                        None => {
-                            return Err(GetError::PerTimelineStateShutDown);
-                        }
-                    }
-                };
-                Ok(Handle(handle))
-            }
-            Err(e) => Err(GetError::TenantManager(e)),
-        }
-    }
-}
-
-impl<T: Types> PerTimelineState<T> {
-    /// After this method returns, [`Cache::get`] will never again return a [`Handle`]
-    /// to the [`Types::Timeline`] that embeds this per-timeline state.
-    /// Even if [`TenantManager::resolve`] would still resolve to it.
-    ///
-    /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
-    /// That's ok because they're short-lived. See module-level comment for details.
-    #[instrument(level = "trace", skip_all)]
-    pub(super) fn shutdown(&self) {
-        let handles = self
-            .handles
-            .lock()
-            .expect("mutex poisoned")
-            // NB: this .take() sets locked to None.
-            // That's what makes future `Cache::get` misses fail.
-            // Cache hits are taken care of below.
-            .take();
-        let Some(handles) = handles else {
-            trace!("already shut down");
-            return;
-        };
-        for handle in handles.values() {
-            // Make hits fail.
-            handle.shut_down.store(true, Ordering::Relaxed);
-        }
-        drop(handles);
-    }
-}
-
-impl<T: Types> std::ops::Deref for Handle<T> {
-    type Target = T::Timeline;
-    fn deref(&self) -> &Self::Target {
-        &self.0.timeline
-    }
-}
-
-#[cfg(test)]
-impl<T: Types> Drop for HandleInner<T> {
-    fn drop(&mut self) {
-        trace!("HandleInner dropped");
-    }
-}
-
-// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
-impl<T: Types> Drop for Cache<T> {
-    fn drop(&mut self) {
-        for (_, weak) in self.map.drain() {
-            if let Some(strong) = weak.upgrade() {
-                // handle is still being kept alive in PerTimelineState
-                let timeline = strong.timeline.per_timeline_state();
-                let mut handles = timeline.handles.lock().expect("mutex poisoned");
-                if let Some(handles) = &mut *handles {
-                    let Some(removed) = handles.remove(&self.id) else {
-                        // There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
-                        continue;
-                    };
-                    assert!(Arc::ptr_eq(&removed, &strong));
-                }
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use pageserver_api::{
-        key::{rel_block_to_key, Key, DBDIR_KEY},
-        models::ShardParameters,
-        reltag::RelTag,
-        shard::ShardStripeSize,
-    };
-    use utils::shard::ShardCount;
-
-    use super::*;
-
-    const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
-
-    #[derive(Debug)]
-    struct TestTypes;
-    impl Types for TestTypes {
-        type TenantManagerError = anyhow::Error;
-        type TenantManager = StubManager;
-        type Timeline = Arc<StubTimeline>;
-    }
-
-    struct StubManager {
-        shards: Vec<Arc<StubTimeline>>,
-    }
-
-    struct StubTimeline {
-        gate: utils::sync::gate::Gate,
-        id: TimelineId,
-        shard: ShardIdentity,
-        per_timeline_state: PerTimelineState<TestTypes>,
-        myself: Weak<StubTimeline>,
-    }
-
-    impl StubTimeline {
-        fn getpage(&self) {
-            // do nothing
-        }
-    }
-
-    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
-        fn gate(&self) -> &utils::sync::gate::Gate {
-            &self.gate
-        }
-
-        fn shard_timeline_id(&self) -> ShardTimelineId {
-            ShardTimelineId {
-                shard_index: self.shard.shard_index(),
-                timeline_id: self.id,
-            }
-        }
-
-        fn get_shard_identity(&self) -> &ShardIdentity {
-            &self.shard
-        }
-
-        fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
-            &self.per_timeline_state
-        }
-    }
-
-    impl TenantManager<TestTypes> for StubManager {
-        async fn resolve(
-            &self,
-            timeline_id: TimelineId,
-            shard_selector: ShardSelector,
-        ) -> anyhow::Result<Arc<StubTimeline>> {
-            for timeline in &self.shards {
-                if timeline.id == timeline_id {
-                    match &shard_selector {
-                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Zero => continue,
-                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Page(_) => continue,
-                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Known(_) => continue,
-                    }
-                }
-            }
-            anyhow::bail!("not found")
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_timeline_shutdown() {
-        crate::tenant::harness::setup_logging();
-
-        let timeline_id = TimelineId::generate();
-        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mgr = StubManager {
-            shards: vec![shard0.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        //
-        // fill the cache
-        //
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (2, 1),
-            "strong: shard0, mgr; weak: myself"
-        );
-
-        let handle: Handle<_> = cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have the timeline");
-        let handle_inner_weak = Arc::downgrade(&handle.0);
-        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-        assert_eq!(
-            (
-                Weak::strong_count(&handle_inner_weak),
-                Weak::weak_count(&handle_inner_weak)
-            ),
-            (2, 2),
-            "strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
-        );
-        assert_eq!(cache.map.len(), 1);
-
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
-        );
-        drop(handle);
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
-        );
-
-        //
-        // demonstrate that Handle holds up gate closure
-        // but shutdown prevents new handles from being handed out
-        //
-
-        tokio::select! {
-            _ = shard0.gate.close() => {
-                panic!("cache and per-timeline handler state keep cache open");
-            }
-            _ = tokio::time::sleep(FOREVER) => {
-                // NB: first poll of close() makes it enter closing state
-            }
-        }
-
-        let handle = cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have the timeline");
-        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-
-        // SHUTDOWN
-        shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
-
-        assert_eq!(
-            1,
-            Weak::strong_count(&handle_inner_weak),
-            "through local var handle"
-        );
-        assert_eq!(
-            cache.map.len(),
-            1,
-            "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
-        );
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(via handle), shard0, mgr; weak: myself"
-        );
-
-        // this handle is perfectly usable
-        handle.getpage();
-
-        cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
-        assert_eq!(
-            cache.map.len(),
-            0,
-            "first access after shutdown cleans up the Weak's from the cache"
-        );
-
-        tokio::select! {
-            _ = shard0.gate.close() => {
-                panic!("handle is keeping gate open");
-            }
-            _ = tokio::time::sleep(FOREVER) => { }
-        }
-
-        drop(handle);
-        assert_eq!(
-            0,
-            Weak::strong_count(&handle_inner_weak),
-            "the HandleInner destructor already ran"
-        );
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (2, 1),
-            "strong: shard0, mgr; weak: myself"
-        );
-
-        // closing gate succeeds after dropping handle
-        tokio::select! {
-            _ = shard0.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("handle is dropped, no other gate holders exist")
-            }
-        }
-
-        // map gets cleaned on next lookup
-        cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown");
-        assert_eq!(cache.map.len(), 0);
-
-        // ensure all refs to shard0 are gone and we're not leaking anything
-        let myself = Weak::clone(&shard0.myself);
-        drop(shard0);
-        drop(mgr);
-        assert_eq!(Weak::strong_count(&myself), 0);
-    }
-
-    #[tokio::test]
-    async fn test_multiple_timelines_and_deletion() {
-        crate::tenant::harness::setup_logging();
-
-        let timeline_a = TimelineId::generate();
-        let timeline_b = TimelineId::generate();
-        assert_ne!(timeline_a, timeline_b);
-        let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_a,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_b,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mut mgr = StubManager {
-            shards: vec![timeline_a.clone(), timeline_b.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        cache
-            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have it");
-        cache
-            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have it");
-        assert_eq!(cache.map.len(), 2);
-
-        // delete timeline A
-        timeline_a.per_timeline_state.shutdown();
-        mgr.shards.retain(|t| t.id != timeline_a.id);
-        assert!(
-            mgr.resolve(timeline_a.id, ShardSelector::Page(key))
-                .await
-                .is_err(),
-            "broken StubManager implementation"
-        );
-
-        assert_eq!(
-            cache.map.len(),
-            2,
-            "cache still has a Weak handle to Timeline A"
-        );
-        cache
-            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown");
-        assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
-
-        cache
-            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we still have it");
-    }
-
-    fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
-        rel_block_to_key(
-            RelTag {
-                spcnode: 1663,
-                dbnode: 208101,
-                relnode: 2620,
-                forknum: 0,
-            },
-            shard.0 as u32 * params.stripe_size.0,
-        )
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_shard_split() {
-        crate::tenant::harness::setup_logging();
-        let timeline_id = TimelineId::generate();
-        let parent = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child_params = ShardParameters {
-            count: ShardCount(2),
-            stripe_size: ShardStripeSize::default(),
-        };
-        let child0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child1 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child_shards_by_shard_number = [child0.clone(), child1.clone()];
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        // fill the cache with the parent
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![parent.clone()],
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(&handle.myself, &parent.myself),
-                "mgr returns parent first"
-            );
-            drop(handle);
-        }
-
-        //
-        // SHARD SPLIT: tenant manager changes, but the cache isn't informed
-        //
-
-        // while we haven't shut down the parent, the cache will return the cached parent, even
-        // if the tenant manager returns the child
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(&handle.myself, &parent.myself),
-                "mgr returns parent"
-            );
-            drop(handle);
-        }
-
-        let parent_handle = cache
-            .get(
-                timeline_id,
-                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
-                &StubManager {
-                    shards: vec![parent.clone()],
-                },
-            )
-            .await
-            .expect("we have it");
-        assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
-
-        // invalidate the cache
-        parent.per_timeline_state.shutdown();
-
-        // the cache will now return the child, even though the parent handle still exists
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(
-                    &handle.myself,
-                    &child_shards_by_shard_number[i as usize].myself
-                ),
-                "mgr returns child"
-            );
-            drop(handle);
-        }
-
-        // all the while the parent handle kept the parent gate open
-        tokio::select! {
-            _ = parent_handle.gate.close() => {
-                panic!("parent handle is keeping gate open");
-            }
-            _ = tokio::time::sleep(FOREVER) => { }
-        }
-        drop(parent_handle);
-        tokio::select! {
-            _ = parent.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("parent handle is dropped, no other gate holders exist")
-            }
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_connection_handler_exit() {
-        crate::tenant::harness::setup_logging();
-        let timeline_id = TimelineId::generate();
-        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mgr = StubManager {
-            shards: vec![shard0.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        // Simulate 10 connections that's opened, used, and closed
-        let mut used_handles = vec![];
-        for _ in 0..10 {
-            let mut cache = Cache::<TestTypes>::default();
-            let handle = {
-                let handle = cache
-                    .get(timeline_id, ShardSelector::Page(key), &mgr)
-                    .await
-                    .expect("we have the timeline");
-                assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-                handle
-            };
-            handle.getpage();
-            used_handles.push(Arc::downgrade(&handle.0));
-        }
-
-        // No handles exist, thus gates are closed and don't require shutdown
-        assert!(used_handles
-            .iter()
-            .all(|weak| Weak::strong_count(weak) == 0));
-
-        // ... thus the gate should close immediately, even without shutdown
-        tokio::select! {
-            _ = shard0.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("handle is dropped, no other gate holders exist")
-            }
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,4 +1,4 @@
-use anyhow::{bail, ensure, Context};
+use anyhow::{bail, ensure, Context, Result};
 use itertools::Itertools;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
@@ -24,142 +24,35 @@ use crate::{
 use super::TimelineWriterState;

 /// Provides semantic APIs to manipulate the layer map.
-pub(crate) enum LayerManager {
-    /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate
-    /// the layers.
-    Open(OpenLayerManager),
-    /// Shutdown layer manager where there are no more in-memory layers and persistent layers are
-    /// read-only.
-    Closed {
-        layers: HashMap<PersistentLayerKey, Layer>,
-    },
-}
-
-impl Default for LayerManager {
-    fn default() -> Self {
-        LayerManager::Open(OpenLayerManager::default())
-    }
+#[derive(Default)]
+pub(crate) struct LayerManager {
+    layer_map: LayerMap,
+    layer_fmgr: LayerFileManager<Layer>,
 }

 impl LayerManager {
-    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
-        // The assumption for the `expect()` is that all code maintains the following invariant:
-        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
-        self.layers()
-            .get(key)
-            .with_context(|| format!("get layer from key: {key}"))
-            .expect("not found")
-            .clone()
-    }
-
    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
-        self.get_from_key(&desc.key())
+        self.layer_fmgr.get_from_desc(desc)
    }

    /// Get an immutable reference to the layer map.
    ///
    /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
    /// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
-    pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> {
-        use LayerManager::*;
-        match self {
-            Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map),
-            Closed { .. } => Err(Shutdown),
-        }
+    pub(crate) fn layer_map(&self) -> &LayerMap {
+        &self.layer_map
    }

-    pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> {
-        use LayerManager::*;
-
-        match self {
-            Open(open) => Ok(open),
-            Closed { .. } => Err(Shutdown),
-        }
-    }
-
-    /// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in
-    /// order to allow shutdown to complete.
-    ///
-    /// If there was a want to flush in-memory layers, it must have happened earlier.
-    pub(crate) fn shutdown(&mut self, writer_state: &mut Option<TimelineWriterState>) {
-        use LayerManager::*;
-        match self {
-            Open(OpenLayerManager {
-                layer_map,
-                layer_fmgr: LayerFileManager(hashmap),
-            }) => {
-                let open = layer_map.open_layer.take();
-                let frozen = layer_map.frozen_layers.len();
-                let taken_writer_state = writer_state.take();
-                tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers");
-                let layers = std::mem::take(hashmap);
-                *self = Closed { layers };
-                assert_eq!(open.is_some(), taken_writer_state.is_some());
-            }
-            Closed { .. } => {
-                tracing::debug!("ignoring multiple shutdowns on layer manager")
-            }
-        }
-    }
-
-    /// Sum up the historic layer sizes
-    pub(crate) fn layer_size_sum(&self) -> u64 {
-        self.layers()
-            .values()
-            .map(|l| l.layer_desc().file_size)
-            .sum()
-    }
-
-    pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = &'_ Layer> + '_ {
-        self.layers().values().filter(|l| l.is_likely_resident())
-    }
-
-    pub(crate) fn contains(&self, layer: &Layer) -> bool {
-        self.contains_key(&layer.layer_desc().key())
-    }
-
-    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
-        self.layers().contains_key(key)
-    }
-
-    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
-        self.layers().keys().cloned().collect_vec()
-    }
-
-    fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
-        use LayerManager::*;
-        match self {
-            Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0,
-            Closed { layers } => layers,
-        }
-    }
-}
-
-#[derive(Default)]
-pub(crate) struct OpenLayerManager {
-    layer_map: LayerMap,
-    layer_fmgr: LayerFileManager<Layer>,
-}
-
-impl std::fmt::Debug for OpenLayerManager {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("OpenLayerManager")
-            .field("layer_count", &self.layer_fmgr.0.len())
-            .finish()
-    }
-}
-
-#[derive(Debug, thiserror::Error)]
-#[error("layer manager has been shutdown")]
-pub(crate) struct Shutdown;
-
-impl OpenLayerManager {
    /// Called from `load_layer_map`. Initialize the layer manager with:
    /// 1. all on-disk layers
    /// 2. next open layer (with disk disk_consistent_lsn LSN)
-    pub(crate) fn initialize_local_layers(&mut self, layers: Vec<Layer>, next_open_layer_at: Lsn) {
+    pub(crate) fn initialize_local_layers(
+        &mut self,
+        on_disk_layers: Vec<Layer>,
+        next_open_layer_at: Lsn,
+    ) {
        let mut updates = self.layer_map.batch_update();
-        for layer in layers {
+        for layer in on_disk_layers {
            Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
        }
        updates.flush();
@@ -171,19 +64,26 @@ impl OpenLayerManager {
        self.layer_map.next_open_layer_at = Some(next_open_layer_at);
    }

-    /// Open a new writable layer to append data if there is no open layer, otherwise return the
-    /// current open layer, called within `get_layer_for_write`.
+    /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
+    /// called within `get_layer_for_write`.
    pub(crate) async fn get_layer_for_write(
        &mut self,
        lsn: Lsn,
+        last_record_lsn: Lsn,
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
-        gate_guard: utils::sync::gate::GateGuard,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<InMemoryLayer>> {
+    ) -> Result<Arc<InMemoryLayer>> {
        ensure!(lsn.is_aligned());

+        ensure!(
+            lsn > last_record_lsn,
+            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
+            lsn,
+            last_record_lsn,
+        );
+
        // Do we have a layer open for writing already?
        let layer = if let Some(open_layer) = &self.layer_map.open_layer {
            if open_layer.get_lsn_range().start > lsn {
@@ -209,15 +109,8 @@ impl OpenLayerManager {
                lsn
            );

-            let new_layer = InMemoryLayer::create(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                start_lsn,
-                gate_guard,
-                ctx,
-            )
-            .await?;
+            let new_layer =
+                InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
            let layer = Arc::new(new_layer);

            self.layer_map.open_layer = Some(layer.clone());
@@ -271,7 +164,7 @@ impl OpenLayerManager {
        froze
    }

-    /// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`].
+    /// Add image layers to the layer map, called from `create_image_layers`.
    pub(crate) fn track_new_image_layers(
        &mut self,
        image_layers: &[ResidentLayer],
@@ -344,7 +237,7 @@ impl OpenLayerManager {
        self.finish_compact_l0(compact_from, compact_to, metrics)
    }

-    /// Called post-compaction when some previous generation image layers were trimmed.
+    /// Called when compaction is completed.
    pub(crate) fn rewrite_layers(
        &mut self,
        rewrite_layers: &[(Layer, ResidentLayer)],
@@ -362,10 +255,13 @@ impl OpenLayerManager {
                new_layer.layer_desc().lsn_range
            );

-            // Transfer visibility hint from old to new layer, since the new layer covers the same key space.  This is not guaranteed to
+            // Transfer visibilty hint from old to new layer, since the new layer covers the same key space.  This is not guaranteed to
            // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
            // always marking rewritten layers as visible.
-            new_layer.as_ref().set_visibility(old_layer.visibility());
+            new_layer
+                .as_ref()
+                .access_stats()
+                .set_visibility(old_layer.access_stats().visibility());

            // Safety: we may never rewrite the same file in-place.  Callers are responsible
            // for ensuring that they only rewrite layers after something changes the path,
@@ -433,6 +329,31 @@ impl OpenLayerManager {
        mapping.remove(layer);
        layer.delete_on_drop();
    }
+
+    pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
+        // for small layer maps, we most likely have all resident, but for larger more are likely
+        // to be evicted assuming lots of layers correlated with longer lifespan.
+
+        self.layer_map().iter_historic_layers().filter_map(|desc| {
+            self.layer_fmgr
+                .0
+                .get(&desc.key())
+                .filter(|l| l.is_likely_resident())
+                .cloned()
+        })
+    }
+
+    pub(crate) fn contains(&self, layer: &Layer) -> bool {
+        self.layer_fmgr.contains(layer)
+    }
+
+    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
+        self.layer_fmgr.contains_key(key)
+    }
+
+    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
+        self.layer_fmgr.0.keys().cloned().collect_vec()
+    }
 }

 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
@@ -444,6 +365,20 @@ impl<T> Default for LayerFileManager<T> {
 }

 impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
+        // The assumption for the `expect()` is that all code maintains the following invariant:
+        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
+        self.0
+            .get(&desc.key())
+            .with_context(|| format!("get layer from desc: {}", desc.layer_name()))
+            .expect("not found")
+            .clone()
+    }
+
+    fn contains_key(&self, key: &PersistentLayerKey) -> bool {
+        self.0.contains_key(key)
+    }
+
    pub(crate) fn insert(&mut self, layer: T) {
        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
        if present.is_some() && cfg!(debug_assertions) {
@@ -451,6 +386,10 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
        }
    }

+    pub(crate) fn contains(&self, layer: &T) -> bool {
+        self.0.contains_key(&layer.layer_desc().key())
+    }
+
    pub(crate) fn remove(&mut self, layer: &T) {
        let present = self.0.remove(&layer.layer_desc().key());
        if present.is_none() && cfg!(debug_assertions) {
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -122,10 +122,6 @@ impl CurrentLogicalSize {
            Self::Exact(_) => Accuracy::Exact,
        }
    }
-
-    pub(crate) fn is_exact(&self) -> bool {
-        matches!(self, Self::Exact(_))
-    }
 }

 impl LogicalSize {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -335,9 +335,6 @@ pub(super) async fn handle_walreceiver_connection(
                            filtered_records += 1;
                        }

-                        // FIXME: this cannot be made pausable_failpoint without fixing the
-                        // failpoint library; in tests, the added amount of debugging will cause us
-                        // to timeout the tests.
                        fail_point!("walreceiver-after-ingest");

                        last_rec_lsn = lsn;
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -19,7 +19,6 @@ use std::collections::BTreeMap;
 use std::num::NonZeroUsize;

 use bytes::BytesMut;
-use itertools::Itertools;
 use pageserver_api::key::Key;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::BoundedBuf;
@@ -62,7 +61,7 @@ pub struct VectoredRead {
    pub start: u64,
    pub end: u64,
    /// Starting offsets and metadata for each blob in this read
-    pub blobs_at: VecMap<u64, (u64, BlobMeta)>,
+    pub blobs_at: VecMap<u64, BlobMeta>,
 }

 impl VectoredRead {
@@ -80,7 +79,7 @@ pub(crate) enum VectoredReadExtended {
 pub(crate) struct VectoredReadBuilder {
    start: u64,
    end: u64,
-    blobs_at: VecMap<u64, (u64, BlobMeta)>,
+    blobs_at: VecMap<u64, BlobMeta>,
    max_read_size: Option<usize>,
 }

@@ -98,7 +97,7 @@ impl VectoredReadBuilder {
    ) -> Self {
        let mut blobs_at = VecMap::default();
        blobs_at
-            .append(start_offset, (end_offset, meta))
+            .append(start_offset, meta)
            .expect("First insertion always succeeds");

        Self {
@@ -123,7 +122,7 @@ impl VectoredReadBuilder {
        } {
            self.end = end;
            self.blobs_at
-                .append(start, (end, meta))
+                .append(start, meta)
                .expect("LSNs are ordered within vectored reads");

            return VectoredReadExtended::Yes;
@@ -271,42 +270,6 @@ impl VectoredReadPlanner {

        reads
    }
-
-    pub fn finish_v2(self) -> Vec<VectoredRead> {
-        const STX_ALIGN: usize = 4096;
-
-        self.blobs
-            .into_iter()
-            .flat_map(|(key, blobs_for_key)| {
-                blobs_for_key
-                    .into_iter()
-                    .map(move |(lsn, start_offset, end_offset)| {
-                        VectoredReadBuilder::new(
-                            start_offset,
-                            end_offset,
-                            BlobMeta { key, lsn },
-                            self.max_read_size,
-                        )
-                    })
-            })
-            .coalesce(|mut x, mut y| {
-                if x.end == y.start && {
-                    if let Some(max_read_size) = x.max_read_size {
-                        x.size() + y.size() <= max_read_size
-                    } else {
-                        true
-                    }
-                } {
-                    if x.blobs_at.extend(&mut y.blobs_at).is_ok() {
-                        x.end = y.end;
-                        return Ok(x);
-                    }
-                }
-                Err((x, y))
-            })
-            .map(|x| x.build())
-            .collect()
-    }
 }

 /// Disk reader for vectored blob spans (does not go through the page cache)
@@ -351,10 +314,21 @@ impl<'a> VectoredBlobReader<'a> {

        let mut metas = Vec::with_capacity(blobs_at.len());

+        // Blobs in `read` only provide their starting offset. The end offset
+        // of a blob is implicit: the start of the next blob if one exists
+        // or the end of the read.
+        let pairs = blobs_at.iter().zip(
+            blobs_at
+                .iter()
+                .map(Some)
+                .skip(1)
+                .chain(std::iter::once(None)),
+        );
+
        // Some scratch space, put here for reusing the allocation
        let mut decompressed_vec = Vec::new();

-        for (offset, (end_offset, meta)) in blobs_at.iter() {
+        for ((offset, meta), next) in pairs {
            let offset_in_buf = offset - start_offset;
            let first_len_byte = buf[offset_in_buf as usize];

@@ -380,8 +354,10 @@ impl<'a> VectoredBlobReader<'a> {
            };

            let start_raw = offset_in_buf + size_length;
-            let end_raw = *end_offset;
-
+            let end_raw = match next {
+                Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
+                None => start_raw + blob_size,
+            };
            assert_eq!(end_raw - start_raw, blob_size);
            let (start, end);
            if compression_bits == BYTE_UNCOMPRESSED {
@@ -493,7 +469,7 @@ impl StreamingVectoredReadPlanner {
                self.read_builder = {
                    let mut blobs_at = VecMap::default();
                    blobs_at
-                        .append(start_offset, (end_offset, BlobMeta { key, lsn }))
+                        .append(start_offset, BlobMeta { key, lsn })
                        .expect("First insertion always succeeds");

                    Some(VectoredReadBuilder {
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -5,17 +5,12 @@

 use anyhow::Context;
 use std::path::Path;
-use utils::serde_percent::Percent;

 use pageserver_api::models::PageserverUtilization;

-use crate::{config::PageServerConf, tenant::mgr::TenantManager};
+pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtilization> {
+    // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough

-pub(crate) fn regenerate(
-    conf: &PageServerConf,
-    tenants_path: &Path,
-    tenant_manager: &TenantManager,
-) -> anyhow::Result<PageserverUtilization> {
    let statvfs = nix::sys::statvfs::statvfs(tenants_path)
        .map_err(std::io::Error::from)
        .context("statvfs tenants directory")?;
@@ -39,31 +34,16 @@ pub(crate) fn regenerate(

    let captured_at = std::time::SystemTime::now();

-    // Calculate aggregate utilization from tenants on this pageserver
-    let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?;
-
-    // Fetch the fraction of disk space which may be used
-    let disk_usable_pct = match conf.disk_usage_based_eviction.clone() {
-        Some(e) => e.max_usage_pct,
-        None => Percent::new(100).unwrap(),
-    };
-
-    // Express a static value for how many shards we may schedule on one node
-    const MAX_SHARDS: u32 = 20000;
-
-    let mut doc = PageserverUtilization {
+    let doc = PageserverUtilization {
        disk_usage_bytes: used,
        free_space_bytes: free,
-        disk_wanted_bytes,
-        disk_usable_pct,
-        shard_count,
-        max_shard_count: MAX_SHARDS,
-        utilization_score: 0,
+        // lower is better; start with a constant
+        //
+        // note that u64::MAX will be output as i64::MAX as u64, but that should not matter
+        utilization_score: u64::MAX,
        captured_at: utils::serde_system_time::SystemTime(captured_at),
    };

-    doc.refresh_score();
-
    // TODO: make utilization_score into a metric

    Ok(doc)
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -30,12 +30,10 @@ use tokio::time::Instant;
 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
 pub use io_engine::feature_test as io_engine_feature_test;
-pub use io_engine::io_engine_for_bench;
 pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
 use self::owned_buffers_io::write::OwnedAsyncWriter;
-pub(crate) use api::DirectIoMode;
 pub(crate) use io_engine::IoEngineKind;
 pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -328,29 +328,3 @@ pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
    .join()
    .unwrap()
 }
-
-/// For use in benchmark binaries only.
-///
-/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also
-/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste
-/// developer time trying to figure out why it's slow.
-///
-/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic.
-pub fn io_engine_for_bench() -> IoEngineKind {
-    #[cfg(not(target_os = "linux"))]
-    {
-        panic!("This benchmark does I/O and can only give a representative result on Linux");
-    }
-    #[cfg(target_os = "linux")]
-    {
-        match feature_test().unwrap() {
-            FeatureTestResult::PlatformPreferred(engine) => engine,
-            FeatureTestResult::Worse {
-                engine: _engine,
-                remark,
-            } => {
-                panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}");
-            }
-        }
-    }
-}
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -107,10 +107,8 @@ enum ProcessOnceCell {
 }

 struct Process {
-    process: process::WalRedoProcess,
-    /// This field is last in this struct so the guard gets dropped _after_ [`Self::process`].
-    /// (Reminder: dropping [`Self::process`] synchronously sends SIGKILL and then `wait()`s for it to exit).
    _launched_processes_guard: utils::sync::gate::GateGuard,
+    process: process::WalRedoProcess,
 }

 impl std::ops::Deref for Process {
@@ -243,9 +241,6 @@ impl PostgresRedoManager {

    /// Shut down the WAL redo manager.
    ///
-    /// Returns `true` if this call was the one that initiated shutdown.
-    /// `true` may be observed by no caller if the first caller stops polling.
-    ///
    /// After this future completes
    /// - no redo process is running
    /// - no new redo process will be spawned
@@ -255,32 +250,22 @@ impl PostgresRedoManager {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub async fn shutdown(&self) -> bool {
+    pub async fn shutdown(&self) {
        // prevent new processes from being spawned
-        let maybe_permit = match self.redo_process.get_or_init_detached().await {
+        let permit = match self.redo_process.get_or_init_detached().await {
            Ok(guard) => {
-                if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
-                    None
-                } else {
-                    let (proc, permit) = guard.take_and_deinit();
-                    drop(proc); // this just drops the Arc, its refcount may not be zero yet
-                    Some(permit)
-                }
+                let (proc, permit) = guard.take_and_deinit();
+                drop(proc); // this just drops the Arc, its refcount may not be zero yet
+                permit
            }
-            Err(permit) => Some(permit),
-        };
-        let it_was_us = if let Some(permit) = maybe_permit {
-            self.redo_process
-                .set(ProcessOnceCell::ManagerShutDown, permit);
-            true
-        } else {
-            false
+            Err(permit) => permit,
        };
+        self.redo_process
+            .set(ProcessOnceCell::ManagerShutDown, permit);
        // wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
        // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
        // for the underlying process.
        self.launched_processes.close().await;
-        it_was_us
    }

    /// This type doesn't have its own background task to check for idleness: we
@@ -329,23 +314,20 @@ impl PostgresRedoManager {
                },
                Err(permit) => {
                    let start = Instant::now();
-                    // acquire guard before spawning process, so that we don't spawn new processes
-                    // if the gate is already closed.
-                    let _launched_processes_guard = match self.launched_processes.enter() {
+                    let proc = Arc::new(Process {
+                            _launched_processes_guard: match self.launched_processes.enter() {
                                Ok(guard) => guard,
                                Err(GateError::GateClosed) => unreachable!(
                                    "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
                                ),
-                            };
-                    let proc = Arc::new(Process {
-                        process: process::WalRedoProcess::launch(
-                            self.conf,
-                            self.tenant_shard_id,
-                            pg_version,
-                        )
-                        .context("launch walredo process")?,
-                        _launched_processes_guard,
-                    });
+                            },
+                            process: process::WalRedoProcess::launch(
+                                self.conf,
+                                self.tenant_shard_id,
+                                pg_version,
+                            )
+                            .context("launch walredo process")?,
+                        });
                    let duration = start.elapsed();
                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
                    info!(
--- a/pageserver/test_data/indices/mixed_workload/README.md
+++ b/pageserver/test_data/indices/mixed_workload/README.md
@@ -1,7 +0,0 @@
-
-# This was captured from one shard of a large tenant in staging.
-
-# It has a mixture of deltas and image layers, >1000 layers in total.
-
-# This is suitable for general smoke tests that want an index which is not
-# trivially small, but doesn't contain weird/pathological cases.
--- a/pageserver/test_data/indices/mixed_workload/index_part.json
+++ b/pageserver/test_data/indices/mixed_workload/index_part.json
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -45,7 +45,6 @@ static const char *jwt_token = NULL;
 /* GUCs */
 static char *ConsoleURL = NULL;
 static bool ForwardDDL = true;
-static bool RegressTestMode = false;

 /*
 * CURL docs say that this buffer must exist until we call curl_easy_cleanup
@@ -803,14 +802,6 @@ NeonProcessUtility(
 		case T_DropRoleStmt:
 			HandleDropRole(castNode(DropRoleStmt, parseTree));
 			break;
-		case T_CreateTableSpaceStmt:
-			if (!RegressTestMode)
-			{
-				ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					errmsg("CREATE TABLESPACE is not supported on Neon")));
-			}
-   			break;
 		default:
 			break;
 	}
@@ -873,18 +864,6 @@ InitControlPlaneConnector()
 							 NULL,
 							 NULL);

-	DefineCustomBoolVariable(
-							 "neon.regress_test_mode",
-							 "Controls whether we are running in the regression test mode",
-							 NULL,
-							 &RegressTestMode,
-							 false,
-							 PGC_SUSET,
-							 0,
-							 NULL,
-							 NULL,
-							 NULL);
-
 	jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
 	if (!jwt_token)
 	{
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -32,7 +32,6 @@
 #include "utils/builtins.h"
 #include "utils/pg_lsn.h"
 #include "utils/guc.h"
-#include "utils/guc_tables.h"
 #include "utils/wait_event.h"

 #include "extension_server.h"
@@ -69,10 +68,10 @@ InitLogicalReplicationMonitor(void)

 	DefineCustomIntVariable(
 							"neon.logical_replication_max_snap_files",
-							"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
+							"Maximum allowed logical replication .snap files",
 							NULL,
 							&logical_replication_max_snap_files,
-							300, -1, INT_MAX,
+							300, 0, INT_MAX,
 							PGC_SIGHUP,
 							0,
 							NULL, NULL, NULL);
@@ -585,40 +584,6 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n
 	return false;
 }

-
-/*
- * pgbouncer is able to track GUCs reported by Postgres.
- * But most parameters cannot be tracked this way. The only parameters that can be tracked are ones
- * that Postgres reports to the client. Unfortunately `search_path` is not reported by Postgres:
- * https://www.postgresql.org/message-id/flat/CAGECzQQ6xFcgrg%2Be0p9mCumtK362TiA6vTiiZKoYbS8OXggwuQ%40mail.gmail.com#be4bfd7a9cf1f0633bdb2d1790a0a1be
- * This code sets GUC_REPORT flag for `search_path`making it possible to include it in
- * pgbouncer's `track_extra_parameters` list.
- *
- * This code is inspired by how the Citus extension does this, see
- * https://github.com/citusdata/citus/blob/2a263fe69a707d16ef24378f7650742386b0968f/src/backend/distributed/shared_library_init.c#L2694
- */
-static void
-ReportSearchPath(void)
-{
-#if PG_VERSION_NUM >= 160000
-	int nGucs = 0;
-	struct config_generic **gucs = get_guc_variables(&nGucs);
-#else
-	struct config_generic **gucs = get_guc_variables();
-	int nGucs = GetNumConfigOptions();
-#endif
-
-	for (int i = 0; i < nGucs; i++)
-	{
-		struct config_generic *guc = (struct config_generic *) gucs[i];
-
-		if (strcmp(guc->name, "search_path") == 0)
-		{
-			guc->flags |= GUC_REPORT;
-		}
-	}
-}
-
 void
 _PG_init(void)
 {
@@ -634,7 +599,6 @@ _PG_init(void)
 	pg_init_walproposer();
 	WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
 	LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
-	SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

 	InitLogicalReplicationMonitor();

@@ -662,8 +626,6 @@ _PG_init(void)
 	 * extension was loaded will be removed.
 	 */
 	EmitWarningsOnPlaceholders("neon");
-
-	ReportSearchPath();
 }

 PG_FUNCTION_INFO_V1(pg_cluster_size);
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -512,7 +512,7 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe
 }

 /*
- * Start walproposer streaming replication
+ * Start walsender streaming replication
 */
 static void
 walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
--- a/pgxn/neon/walsender_hooks.c
+++ b/pgxn/neon/walsender_hooks.c
@@ -20,7 +20,6 @@
 #include "utils/guc.h"
 #include "postmaster/interrupt.h"

-#include "neon.h"
 #include "neon_walreader.h"
 #include "walproposer.h"

@@ -182,13 +181,6 @@ NeonWALReadSegmentClose(XLogReaderState *xlogreader)
 void
 NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)
 {
-	/*
-	 * If safekeepers are not configured, assume we don't need neon_walreader,
-	 * i.e. running neon fork locally.
-	 */
-	if (wal_acceptors_list[0] == '\0')
-		return;
-
 	if (!wal_reader)
 	{
 		XLogRecPtr	epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn);
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,103 +1,91 @@
 # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.

-[[package]]
-name = "aiohappyeyeballs"
-version = "2.3.5"
-description = "Happy Eyeballs for asyncio"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"},
-    {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"},
-]
-
 [[package]]
 name = "aiohttp"
-version = "3.10.2"
+version = "3.9.4"
 description = "Async http client/server framework (asyncio)"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"},
-    {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"},
-    {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"},
-    {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"},
-    {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"},
-    {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"},
-    {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"},
-    {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"},
-    {file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"},
-    {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"},
-    {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"},
-    {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"},
-    {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"},
-    {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"},
-    {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"},
-    {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"},
-    {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"},
-    {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"},
-    {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"},
-    {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"},
-    {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"},
-    {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"},
-    {file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"},
-    {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"},
-    {file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"},
-    {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"},
+    {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"},
+    {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"},
+    {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"},
+    {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"},
+    {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"},
+    {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"},
+    {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"},
+    {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"},
+    {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"},
+    {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"},
+    {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"},
+    {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"},
+    {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"},
+    {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"},
+    {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"},
+    {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"},
+    {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"},
+    {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"},
+    {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"},
+    {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"},
+    {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"},
+    {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"},
+    {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"},
+    {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"},
+    {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"},
+    {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"},
 ]

 [package.dependencies]
-aiohappyeyeballs = ">=2.3.0"
 aiosignal = ">=1.1.2"
 async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
 attrs = ">=17.3.0"
@@ -106,7 +94,7 @@ multidict = ">=4.5,<7.0"
 yarl = ">=1.0,<2.0"

 [package.extras]
-speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"]
+speedups = ["Brotli", "aiodns", "brotlicffi"]

 [[package]]
 name = "aiopg"
@@ -1526,20 +1514,6 @@ files = [
 [package.dependencies]
 six = "*"

-[[package]]
-name = "kafka-python"
-version = "2.0.2"
-description = "Pure Python client for Apache Kafka"
-optional = false
-python-versions = "*"
-files = [
-    {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
-    {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
-]
-
-[package.extras]
-crc32c = ["crc32c"]
-
 [[package]]
 name = "lazy-object-proxy"
 version = "1.10.0"
@@ -3383,4 +3357,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055"
+content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -92,7 +92,6 @@ tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 tracing.workspace = true
-try-lock.workspace = true
 typed-json.workspace = true
 url.workspace = true
 urlencoding.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -218,7 +218,7 @@ impl RateBucketInfo {
 impl AuthenticationConfig {
    pub fn check_rate_limit(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        config: &AuthenticationConfig,
        secret: AuthSecret,
        endpoint: &EndpointId,
@@ -243,7 +243,7 @@ impl AuthenticationConfig {
        let limit_not_exceeded = self.rate_limiter.check(
            (
                endpoint_int,
-                MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet),
+                MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
            ),
            password_weight,
        );
@@ -274,7 +274,7 @@ impl AuthenticationConfig {
 ///
 /// All authentication flows will emit an AuthenticationOk message if successful.
 async fn auth_quirks(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    api: &impl console::Api,
    user_info: ComputeUserInfoMaybeEndpoint,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -303,8 +303,8 @@ async fn auth_quirks(
    let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;

    // check allowed list
-    if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
-        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
+    if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
+        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
    }

    if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
@@ -356,7 +356,7 @@ async fn auth_quirks(
 }

 async fn authenticate_with_secret(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    secret: AuthSecret,
    info: ComputeUserInfo,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -421,7 +421,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
    #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
    pub async fn authenticate(
        self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
@@ -467,7 +467,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
 impl BackendType<'_, ComputeUserInfo, &()> {
    pub async fn get_role_secret(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
        use BackendType::*;
        match self {
@@ -478,7 +478,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {

    pub async fn get_allowed_ips_and_secret(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
        use BackendType::*;
        match self {
@@ -492,7 +492,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
 impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
    async fn wake_compute(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
        use BackendType::*;

@@ -514,7 +514,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
 impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
    async fn wake_compute(
        &self,
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
        use BackendType::*;

@@ -571,7 +571,7 @@ mod tests {
    impl console::Api for Auth {
        async fn get_role_secret(
            &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &mut RequestMonitoring,
            _user_info: &super::ComputeUserInfo,
        ) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
            Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
@@ -579,7 +579,7 @@ mod tests {

        async fn get_allowed_ips_and_secret(
            &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &mut RequestMonitoring,
            _user_info: &super::ComputeUserInfo,
        ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
        {
@@ -591,7 +591,7 @@ mod tests {

        async fn wake_compute(
            &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &mut RequestMonitoring,
            _user_info: &super::ComputeUserInfo,
        ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
            unimplemented!()
@@ -665,7 +665,7 @@ mod tests {
        let (mut client, server) = tokio::io::duplex(1024);
        let mut stream = PqStream::new(Stream::from_raw(server));

-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let api = Auth {
            ips: vec![],
            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -723,7 +723,7 @@ mod tests {
        ));

        let _creds = auth_quirks(
-            &ctx,
+            &mut ctx,
            &api,
            user_info,
            &mut stream,
@@ -742,7 +742,7 @@ mod tests {
        let (mut client, server) = tokio::io::duplex(1024);
        let mut stream = PqStream::new(Stream::from_raw(server));

-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let api = Auth {
            ips: vec![],
            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -775,7 +775,7 @@ mod tests {
        ));

        let _creds = auth_quirks(
-            &ctx,
+            &mut ctx,
            &api,
            user_info,
            &mut stream,
@@ -794,7 +794,7 @@ mod tests {
        let (mut client, server) = tokio::io::duplex(1024);
        let mut stream = PqStream::new(Stream::from_raw(server));

-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let api = Auth {
            ips: vec![],
            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -828,7 +828,7 @@ mod tests {
        ));

        let creds = auth_quirks(
-            &ctx,
+            &mut ctx,
            &api,
            user_info,
            &mut stream,
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -12,7 +12,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};

 pub(super) async fn authenticate(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    creds: ComputeUserInfo,
    client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    config: &'static AuthenticationConfig,
@@ -27,7 +27,7 @@ pub(super) async fn authenticate(
        }
        AuthSecret::Scram(secret) => {
            info!("auth endpoint chooses SCRAM");
-            let scram = auth::Scram(&secret, ctx);
+            let scram = auth::Scram(&secret, &mut *ctx);

            let auth_outcome = tokio::time::timeout(
                config.scram_protocol_timeout,
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -18,7 +18,7 @@ use tracing::{info, warn};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub async fn authenticate_cleartext(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    info: ComputeUserInfo,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    secret: AuthSecret,
@@ -28,7 +28,7 @@ pub async fn authenticate_cleartext(
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

    // pause the timer while we communicate with the client
-    let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+    let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

    let ep = EndpointIdInt::from(&info.endpoint);

@@ -60,7 +60,7 @@ pub async fn authenticate_cleartext(
 /// Similar to [`authenticate_cleartext`], but there's a specific password format,
 /// and passwords are not yet validated (we don't know how to validate them!)
 pub async fn password_hack_no_authentication(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    info: ComputeUserInfoNoEndpoint,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
 ) -> auth::Result<ComputeCredentials> {
@@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication(
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

    // pause the timer while we communicate with the client
-    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+    let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

    let payload = AuthFlow::new(client)
        .begin(auth::PasswordHack)
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String {
 }

 pub(super) async fn authenticate(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<NodeInfo> {
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -84,7 +84,7 @@ pub fn endpoint_sni(

 impl ComputeUserInfoMaybeEndpoint {
    pub fn parse(
-        ctx: &RequestMonitoring,
+        ctx: &mut RequestMonitoring,
        params: &StartupMessageParams,
        sni: Option<&str>,
        common_names: Option<&HashSet<String>>,
@@ -249,8 +249,8 @@ mod tests {
    fn parse_bare_minimum() -> anyhow::Result<()> {
        // According to postgresql, only `user` should be required.
        let options = StartupMessageParams::new([("user", "john_doe")]);
-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id, None);

@@ -264,8 +264,8 @@ mod tests {
            ("database", "world"), // should be ignored
            ("foo", "bar"),        // should be ignored
        ]);
-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id, None);

@@ -279,9 +279,9 @@ mod tests {
        let sni = Some("foo.localhost");
        let common_names = Some(["localhost".into()].into());

-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
        assert_eq!(user_info.options.get_cache_key("foo"), "foo");
@@ -296,8 +296,8 @@ mod tests {
            ("options", "-ckey=1 project=bar -c geqo=off"),
        ]);

-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));

@@ -311,8 +311,8 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar -c geqo=off"),
        ]);

-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));

@@ -329,8 +329,8 @@ mod tests {
            ),
        ]);

-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert!(user_info.endpoint_id.is_none());

@@ -344,8 +344,8 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
        ]);

-        let ctx = RequestMonitoring::test();
-        let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
+        let mut ctx = RequestMonitoring::test();
+        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
        assert!(user_info.endpoint_id.is_none());

@@ -359,9 +359,9 @@ mod tests {
        let sni = Some("baz.localhost");
        let common_names = Some(["localhost".into()].into());

-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.user, "john_doe");
        assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));

@@ -374,16 +374,16 @@ mod tests {

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.a.com");
-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.b.com");
-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));

        Ok(())
@@ -397,9 +397,10 @@ mod tests {
        let sni = Some("second.localhost");
        let common_names = Some(["localhost".into()].into());

-        let ctx = RequestMonitoring::test();
-        let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
-            .expect_err("should fail");
+        let mut ctx = RequestMonitoring::test();
+        let err =
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
+                .expect_err("should fail");
        match err {
            InconsistentProjectNames { domain, option } => {
                assert_eq!(option, "first");
@@ -416,9 +417,10 @@ mod tests {
        let sni = Some("project.localhost");
        let common_names = Some(["example.com".into()].into());

-        let ctx = RequestMonitoring::test();
-        let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
-            .expect_err("should fail");
+        let mut ctx = RequestMonitoring::test();
+        let err =
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
+                .expect_err("should fail");
        match err {
            UnknownCommonName { cn } => {
                assert_eq!(cn, "localhost");
@@ -436,9 +438,9 @@ mod tests {

        let sni = Some("project.localhost");
        let common_names = Some(["localhost".into()].into());
-        let ctx = RequestMonitoring::test();
+        let mut ctx = RequestMonitoring::test();
        let user_info =
-            ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
+            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
        assert_eq!(
            user_info.options.get_cache_key("project"),
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -27,7 +27,7 @@ pub trait AuthMethod {
 pub struct Begin;

 /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
-pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring);
+pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring);

 impl AuthMethod for Scram<'_> {
    #[inline(always)]
@@ -155,7 +155,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
        let Scram(secret, ctx) = self.state;

        // pause the timer while we communicate with the client
-        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+        let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

        // Initial client message contains the chosen auth method's name.
        let msg = self.stream.read_password_message().await?;
@@ -168,8 +168,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
        }

        match sasl.method {
-            SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256),
-            SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
+            SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256),
+            SCRAM_SHA_256_PLUS => {
+                ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus)
+            }
            _ => {}
        }
        info!("client chooses {}", sasl.method);
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -205,7 +205,7 @@ async fn task_main(
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";

 async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
-    ctx: &RequestMonitoring,
+    ctx: &mut RequestMonitoring,
    raw_stream: S,
    tls_config: Arc<rustls::ServerConfig>,
    tls_server_end_point: TlsServerEndPoint,
@@ -256,13 +256,13 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
 }

 async fn handle_client(
-    ctx: RequestMonitoring,
+    mut ctx: RequestMonitoring,
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
    tls_server_end_point: TlsServerEndPoint,
    stream: impl AsyncRead + AsyncWrite + Unpin,
 ) -> anyhow::Result<()> {
-    let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?;
+    let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?;

    // Cut off first part of the SNI domain
    // We receive required destination details in the format of
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -5,7 +5,6 @@ use aws_config::meta::region::RegionProviderChain;
 use aws_config::profile::ProfileFileCredentialsProvider;
 use aws_config::provider_config::ProviderConfig;
 use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
-use aws_config::Region;
 use futures::future::Either;
 use proxy::auth;
 use proxy::auth::backend::AuthRateLimiter;
@@ -291,10 +290,9 @@ async fn main() -> anyhow::Result<()> {
    let config = build_config(&args)?;

    info!("Authentication backend: {}", config.auth_backend);
-    info!("Using region: {}", args.aws_region);
+    info!("Using region: {}", config.aws_region);

-    let region_provider =
-        RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone()));
+    let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
    let provider_conf =
        ProviderConfig::without_region().with_region(region_provider.region().await);
    let aws_credentials_provider = {
@@ -320,7 +318,7 @@ async fn main() -> anyhow::Result<()> {
    };
    let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
        elasticache::AWSIRSAConfig::new(
-            args.aws_region.clone(),
+            config.aws_region.clone(),
            args.redis_cluster_name,
            args.redis_user_id,
        ),
@@ -378,14 +376,11 @@ async fn main() -> anyhow::Result<()> {

    let cancel_map = CancelMap::default();

-    let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone());
-    RateBucketInfo::validate(redis_rps_limit)?;
-
    let redis_publisher = match &regional_redis_client {
        Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
            redis_publisher.clone(),
            args.region.clone(),
-            redis_rps_limit,
+            &config.redis_rps_limit,
        )?))),
        None => None,
    };
@@ -661,6 +656,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    )?;

    let http_config = HttpConfig {
+        request_timeout: args.sql_over_http.sql_over_http_timeout,
        pool_options: GlobalConnPoolOptions {
            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
            gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
@@ -680,6 +676,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
    };

+    let mut redis_rps_limit = args.redis_rps_limit.clone();
+    RateBucketInfo::validate(&mut redis_rps_limit)?;
+
    let config = Box::leak(Box::new(ProxyConfig {
        tls_config,
        auth_backend,
@@ -688,8 +687,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        http_config,
        authentication_config,
        require_client_ip: args.require_client_ip,
+        disable_ip_check_for_http: args.disable_ip_check_for_http,
+        redis_rps_limit,
        handshake_timeout: args.handshake_timeout,
        region: args.region.clone(),
+        aws_region: args.aws_region.clone(),
        wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
        connect_compute_locks,
        connect_to_compute_retry_config: config::RetryConfig::parse(
--- a/Show More
+++ b/Show More