Merge pull request #8505 from neondatabase/rc/proxy/2024-07-25

Proxy release 2024-07-25
2026-06-03 13:30:38 +00:00 · 2024-07-25 11:07:19 +01:00
parent 98355a419a 6fc2726568
commit b0d69acb07
215 changed files with 8582 additions and 4282 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+# allows for nicer hunk headers with git show
+*.rs diff=rust
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -9,8 +9,8 @@ inputs:
    description: 'Region ID, if not set the project will be created in the default region'
    default: aws-us-east-2
  postgres_version:
-    description: 'Postgres version; default is 15'
-    default: '15'
+    description: 'Postgres version; default is 16'
+    default: '16'
  api_host:
    description: 'Neon API host'
    default: console-stage.neon.build
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -0,0 +1,291 @@
+name: Build and Test Locally
+
+on:
+  workflow_call:
+    inputs:
+      arch:
+        description: 'x64 or arm64'
+        required: true
+        type: string
+      build-tag:
+        description: 'build tag'
+        required: true
+        type: string
+      build-tools-image:
+        description: 'build-tools image'
+        required: true
+        type: string
+      build-type:
+        description: 'debug or release'
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+jobs:
+  build-neon:
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    container:
+      image: ${{ inputs.build-tools-image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      # Raise locked memory limit for tokio-epoll-uring.
+      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
+      # io_uring will account the memory of the CQ and SQ as locked.
+      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+    env:
+      BUILD_TYPE: ${{ inputs.build-type }}
+      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ inputs.build-tag }}
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      # Set some environment variables used by all the steps.
+      #
+      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
+      #   It also includes --features, if any
+      #
+      # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
+      #   because "cargo metadata" doesn't accept --release or --debug options
+      #
+      # We run tests with addtional features, that are turned off by default (e.g. in release builds), see
+      # corresponding Cargo.toml files for their descriptions.
+      - name: Set env variables
+        run: |
+          CARGO_FEATURES="--features testing"
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
+            CARGO_FLAGS="--locked"
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=""
+            CARGO_FLAGS="--locked --release"
+          fi
+          {
+            echo "cov_prefix=${cov_prefix}"
+            echo "CARGO_FEATURES=${CARGO_FEATURES}"
+            echo "CARGO_FLAGS=${CARGO_FLAGS}"
+            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
+          } >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      # Do install *before* running rust tests because they might recompile the
+      # binaries with different features/flags.
+      - name: Install rust binaries
+        run: |
+          # Install target binaries
+          mkdir -p /tmp/neon/bin/
+          binaries=$(
+            ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
+            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
+          )
+          for bin in $binaries; do
+            SRC=target/$BUILD_TYPE/$bin
+            DST=/tmp/neon/bin/$bin
+            cp "$SRC" "$DST"
+          done
+
+          # Install test executables and write list of all binaries (for code coverage)
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            # Keep bloated coverage data files away from the rest of the artifact
+            mkdir -p /tmp/coverage/
+
+            mkdir -p /tmp/neon/test_bin/
+
+            test_exe_paths=$(
+              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
+              jq -r '.executable | select(. != null)'
+            )
+            for bin in $test_exe_paths; do
+              SRC=$bin
+              DST=/tmp/neon/test_bin/$(basename $bin)
+
+              # We don't need debug symbols for code coverage, so strip them out to make
+              # the artifact smaller.
+              strip "$SRC" -o "$DST"
+              echo "$DST" >> /tmp/coverage/binaries.list
+            done
+
+            for bin in $binaries; do
+              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
+            done
+          fi
+
+      - name: Run rust tests
+        env:
+          NEXTEST_RETRIES: 3
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+          export LD_LIBRARY_PATH
+
+          #nextest does not yet support running doctests
+          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
+
+          for io_engine in std-fs tokio-epoll-uring ; do
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          done
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
+
+      - name: Install postgres binaries
+        run: cp -a pg_install /tmp/neon/pg_install
+
+      - name: Upload Neon artifact
+        uses: ./.github/actions/upload
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
+          path: /tmp/neon
+
+      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
+      - name: Merge and upload coverage data
+        if: inputs.build-type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  regress-tests:
+    # Run test on x64 only
+    if: inputs.arch == 'x64'
+    needs: [ build-neon ]
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    container:
+      image: ${{ inputs.build-tools-image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      # for changed limits, see comments on `options:` earlier in this file
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+    strategy:
+      fail-fast: false
+      matrix:
+        pg_version: [ v14, v15, v16 ]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Pytest regression tests
+        uses: ./.github/actions/run-python-test-set
+        timeout-minutes: 60
+        with:
+          build_type: ${{ inputs.build-type }}
+          test_selection: regress
+          needs_postgres_source: true
+          run_with_real_s3: true
+          real_s3_bucket: neon-github-ci-tests
+          real_s3_region: eu-central-1
+          rerun_flaky: true
+          pg_version: ${{ matrix.pg_version }}
+        env:
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
+          BUILD_TAG: ${{ inputs.build-tag }}
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_IMPL: vectored
+          PAGESERVER_GET_IMPL: vectored
+          PAGESERVER_VALIDATE_VEC_GET: true
+
+      # Temporary disable this step until we figure out why it's so flaky
+      # Ref https://github.com/neondatabase/neon/issues/4540
+      - name: Merge and upload coverage data
+        if: |
+          false &&
+          inputs.build-type == 'debug' && matrix.pg_version == 'v14'
+        uses: ./.github/actions/save-coverage-data
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -56,15 +56,27 @@ concurrency:
 jobs:
  bench:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - DEFAULT_PG_VERSION: 16
+            PLATFORM: "neon-staging"
+            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+            provisioner: 'k8s-pod' 
+          - DEFAULT_PG_VERSION: 16
+            PLATFORM: "azure-staging"
+            region_id: 'azure-eastus2'
+            provisioner: 'k8s-neonvm'
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: "neon-staging"
+      PLATFORM: ${{ matrix.PLATFORM }}

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -85,9 +97,10 @@ jobs:
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
-        region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+        region_id: ${{ matrix.region_id }}
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+        provisioner: ${{ matrix.provisioner }}

    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
@@ -96,13 +109,14 @@ jobs:
        test_selection: performance
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
        # Set --sparse-ordering option of pytest-order plugin
        # to ensure tests are running in order of appears in the file.
        # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
        extra_params:
          -m remote_cluster
          --sparse-ordering
-          --timeout 5400
+          --timeout 14400
          --ignore test_runner/performance/test_perf_olap.py
          --ignore test_runner/performance/test_perf_pgvector_queries.py
          --ignore test_runner/performance/test_logical_replication.py
@@ -133,6 +147,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  replication-tests:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      DEFAULT_PG_VERSION: 14
@@ -177,6 +192,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 5400
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -202,11 +218,14 @@ jobs:
    # Available platforms:
    # - neon-captest-new: Freshly created project (1 CU)
    # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
+    # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
+    # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
    # - neon-captest-reuse: Reusing existing project
    # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
    env:
      RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
+      DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
    runs-on: ubuntu-22.04
    outputs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
@@ -217,23 +236,33 @@ jobs:
    - name: Generate matrix for pgbench benchmark
      id: pgbench-compare-matrix
      run: |
+        region_id_default=${{ env.DEFAULT_REGION_ID }}
        matrix='{
+          "pg_version" : [
+            16
+          ],
+          "region_id" : [
+            "'"$region_id_default"'"
+            ],
          "platform": [
            "neon-captest-new",
            "neon-captest-reuse",
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "platform": "neon-captest-freetier",         "db_size": "3gb"  },
-                      { "platform": "neon-captest-new",              "db_size": "50gb" },
-                      { "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
-                      { "platform": "neonvm-captest-new",            "db_size": "50gb" },
-                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
+          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier",         "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new",              "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
-                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"},
+                                                     { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -285,7 +314,7 @@ jobs:
      TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
      TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -310,14 +339,14 @@ jobs:
        prefix: latest

    - name: Create Neon Project
-      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
+      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
-        region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+        region_id: ${{ matrix.region_id }}
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-        compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
+        compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
        provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

    - name: Set up Connection String
@@ -330,7 +359,7 @@ jobs:
          neonvm-captest-sharding-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
            ;;
-          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
          rds-aurora)
@@ -355,6 +384,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -368,6 +398,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -381,6 +412,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -407,6 +439,13 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  pgbench-pgvector:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - PLATFORM: "neon-captest-pgvector"
+          - PLATFORM: "azure-captest-pgvector"
+            
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
      TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -414,8 +453,9 @@ jobs:
      DEFAULT_PG_VERSION: 16
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
+      LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: "neon-captest-pgvector"
+      PLATFORM: ${{ matrix.PLATFORM }}

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -425,17 +465,39 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    - name: Download Neon artifact
-      uses: ./.github/actions/download
-      with:
-        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-        path: /tmp/neon/
-        prefix: latest
+    # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
+    # instead of using Neon artifacts containing pgbench
+    - name: Install postgresql-16 where pytest expects it
+      run: |
+        cd /home/nonroot
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb 
+        dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
+        dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
+        dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
+        mkdir -p /tmp/neon/pg_install/v16/bin
+        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench  
+        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql  
+        ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib 
+        /tmp/neon/pg_install/v16/bin/pgbench --version
+        /tmp/neon/pg_install/v16/bin/psql --version

    - name: Set up Connection String
      id: set-up-connstr
      run: |
-        CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
+        case "${PLATFORM}" in
+          neon-captest-pgvector)
+            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
+            ;;
+          azure-captest-pgvector)
+            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
+            ;;
+          *)
+            echo >&2 "Unknown PLATFORM=${PLATFORM}"
+            exit 1
+            ;;
+        esac

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

@@ -447,6 +509,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -460,6 +523,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -474,11 +538,10 @@ jobs:
      uses: slackapi/slack-github-action@v1
      with:
        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

-
  clickbench-compare:
    # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
    # we use for performance testing in pgbench-compare.
@@ -722,6 +785,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -125,7 +125,11 @@ jobs:

  check-codestyle-rust:
    needs: [ check-permissions, build-build-tools-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    strategy:
+      matrix:
+        arch: [ x64, arm64 ]
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
+
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
      credentials:
@@ -193,291 +197,27 @@ jobs:
        if: ${{ !cancelled() }}
        run: cargo deny check --hide-inclusion-graph

-  build-neon:
-    needs: [ check-permissions, tag, build-build-tools-image ]
-    runs-on: [ self-hosted, gen3, large ]
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      # Raise locked memory limit for tokio-epoll-uring.
-      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
-      # io_uring will account the memory of the CQ and SQ as locked.
-      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+  build-and-test-locally:
+    needs: [ tag, build-build-tools-image ]
    strategy:
      fail-fast: false
      matrix:
-        build_type: [ debug, release ]
-    env:
-      BUILD_TYPE: ${{ matrix.build_type }}
-      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
-      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
-
-    steps:
-      - name: Fix git ownership
-        run: |
-          # Workaround for `fatal: detected dubious ownership in repository at ...`
-          #
-          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
-          #   Ref https://github.com/actions/checkout/issues/785
-          #
-          git config --global --add safe.directory ${{ github.workspace }}
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done
-
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 1
-
-      - name: Set pg 14 revision for caching
-        id: pg_v14_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
-
-      - name: Set pg 15 revision for caching
-        id: pg_v15_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
-
-      - name: Set pg 16 revision for caching
-        id: pg_v16_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
-
-      # Set some environment variables used by all the steps.
-      #
-      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
-      #   It also includes --features, if any
-      #
-      # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
-      #   because "cargo metadata" doesn't accept --release or --debug options
-      #
-      # We run tests with addtional features, that are turned off by default (e.g. in release builds), see
-      # corresponding Cargo.toml files for their descriptions.
-      - name: Set env variables
-        run: |
-          CARGO_FEATURES="--features testing"
-          if [[ $BUILD_TYPE == "debug" ]]; then
-            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_FLAGS="--locked"
-          elif [[ $BUILD_TYPE == "release" ]]; then
-            cov_prefix=""
-            CARGO_FLAGS="--locked --release"
-          fi
-          {
-            echo "cov_prefix=${cov_prefix}"
-            echo "CARGO_FEATURES=${CARGO_FEATURES}"
-            echo "CARGO_FLAGS=${CARGO_FLAGS}"
-            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
-          } >> $GITHUB_ENV
-
-      # Disabled for now
-      # Don't include the ~/.cargo/registry/src directory. It contains just
-      # uncompressed versions of the crates in ~/.cargo/registry/cache
-      # directory, and it's faster to let 'cargo' to rebuild it from the
-      # compressed crates.
-#      - name: Cache cargo deps
-#        id: cache_cargo
-#        uses: actions/cache@v4
-#        with:
-#          path: |
-#            ~/.cargo/registry/
-#            !~/.cargo/registry/src
-#            ~/.cargo/git/
-#            target/
-#          # Fall back to older versions of the key, if no cache for current Cargo.lock was found
-#          key: |
-#            v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
-#            v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
-
-      - name: Cache postgres v14 build
-        id: cache_pg_14
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
-
-      - name: Cache postgres v15 build
-        id: cache_pg_15
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
-
-      - name: Cache postgres v16 build
-        id: cache_pg_16
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
-
-      - name: Build postgres v14
-        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v14 -j$(nproc)
-
-      - name: Build postgres v15
-        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v15 -j$(nproc)
-
-      - name: Build postgres v16
-        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v16 -j$(nproc)
-
-      - name: Build neon extensions
-        run: mold -run make neon-pg-ext -j$(nproc)
-
-      - name: Build walproposer-lib
-        run: mold -run make walproposer-lib -j$(nproc)
-
-      - name: Run cargo build
-        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
-
-      # Do install *before* running rust tests because they might recompile the
-      # binaries with different features/flags.
-      - name: Install rust binaries
-        run: |
-          # Install target binaries
-          mkdir -p /tmp/neon/bin/
-          binaries=$(
-            ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
-            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
-          )
-          for bin in $binaries; do
-            SRC=target/$BUILD_TYPE/$bin
-            DST=/tmp/neon/bin/$bin
-            cp "$SRC" "$DST"
-          done
-
-          # Install test executables and write list of all binaries (for code coverage)
-          if [[ $BUILD_TYPE == "debug" ]]; then
-            # Keep bloated coverage data files away from the rest of the artifact
-            mkdir -p /tmp/coverage/
-
-            mkdir -p /tmp/neon/test_bin/
-
-            test_exe_paths=$(
-              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
-              jq -r '.executable | select(. != null)'
-            )
-            for bin in $test_exe_paths; do
-              SRC=$bin
-              DST=/tmp/neon/test_bin/$(basename $bin)
-
-              # We don't need debug symbols for code coverage, so strip them out to make
-              # the artifact smaller.
-              strip "$SRC" -o "$DST"
-              echo "$DST" >> /tmp/coverage/binaries.list
-            done
-
-            for bin in $binaries; do
-              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
-            done
-          fi
-
-      - name: Run rust tests
-        env:
-          NEXTEST_RETRIES: 3
-        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
-          export LD_LIBRARY_PATH
-
-          #nextest does not yet support running doctests
-          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
-
-          for io_engine in std-fs tokio-epoll-uring ; do
-            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
-          done
-
-          # Run separate tests for real S3
-          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
-          export REMOTE_STORAGE_S3_REGION=eu-central-1
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
-
-          # Run separate tests for real Azure Blob Storage
-          # XXX: replace region with `eu-central-1`-like region
-          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
-          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
-          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
-
-      - name: Install postgres binaries
-        run: cp -a pg_install /tmp/neon/pg_install
-
-      - name: Upload Neon artifact
-        uses: ./.github/actions/upload
-        with:
-          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
-          path: /tmp/neon
-
-      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
-      - name: Merge and upload coverage data
-        if: matrix.build_type == 'debug'
-        uses: ./.github/actions/save-coverage-data
-
-  regress-tests:
-    needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      # for changed limits, see comments on `options:` earlier in this file
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
-    strategy:
-      fail-fast: false
-      matrix:
-        build_type: [ debug, release ]
-        pg_version: [ v14, v15, v16 ]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 1
-
-      - name: Pytest regression tests
-        uses: ./.github/actions/run-python-test-set
-        timeout-minutes: 60
-        with:
-          build_type: ${{ matrix.build_type }}
-          test_selection: regress
-          needs_postgres_source: true
-          run_with_real_s3: true
-          real_s3_bucket: neon-github-ci-tests
-          real_s3_region: eu-central-1
-          rerun_flaky: true
-          pg_version: ${{ matrix.pg_version }}
-        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
-          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_IMPL: vectored
-          PAGESERVER_GET_IMPL: vectored
-          PAGESERVER_VALIDATE_VEC_GET: true
-
-      # Temporary disable this step until we figure out why it's so flaky
-      # Ref https://github.com/neondatabase/neon/issues/4540
-      - name: Merge and upload coverage data
-        if: |
-          false &&
-          matrix.build_type == 'debug' && matrix.pg_version == 'v14'
-        uses: ./.github/actions/save-coverage-data
+        arch: [ x64 ]
+        build-type: [ debug, release ]
+        include:
+          - build-type: release
+            arch: arm64
+    uses: ./.github/workflows/_build-and-test-locally.yml
+    with:
+      arch: ${{ matrix.arch }}
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
+      build-tag: ${{ needs.tag.outputs.build-tag }}
+      build-type: ${{ matrix.build-type }}
+    secrets: inherit

+  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
  get-benchmarks-durations:
+    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    outputs:
      json: ${{ steps.get-benchmark-durations.outputs.json }}
    needs: [ check-permissions, build-build-tools-image ]
@@ -488,7 +228,6 @@ jobs:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -513,7 +252,8 @@ jobs:
          echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT

  benchmarks:
-    needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
+    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
+    needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -522,7 +262,6 @@ jobs:
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      # for changed limits, see comments on `options:` earlier in this file
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
      matrix:
@@ -570,7 +309,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
+    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -621,7 +360,7 @@ jobs:
            })

  coverage-report:
-    needs: [ check-permissions, regress-tests, build-build-tools-image ]
+    needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -772,7 +511,8 @@ jobs:
          pull: true
          file: Dockerfile
          cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
+          # 23.07.2024 temporarily disable cache saving in the registry as it is very slow
+          # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
          tags: |
            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

@@ -865,7 +605,8 @@ jobs:
          pull: true
          file: Dockerfile.compute-node
          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
+          # 23.07.2024 temporarily disable cache saving in the registry as it is very slow
+          # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
          tags: |
            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

@@ -885,7 +626,8 @@ jobs:
          file: Dockerfile.compute-node
          target: neon-pg-ext-test
          cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
+          # 23.07.2024 temporarily disable cache saving in the registry as it is very slow
+          # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
          tags: |
            neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}

@@ -1223,7 +965,7 @@ jobs:
          exit 1

  deploy:
-    needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
+    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
    if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'

    runs-on: [ self-hosted, gen3, small ]
@@ -1324,7 +1066,7 @@ jobs:
            })

  promote-compatibility-data:
-    needs: [ check-permissions, promote-images, tag, regress-tests ]
+    needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
    if: github.ref_name == 'release'

    runs-on: [ self-hosted, gen3, small ]
@@ -1363,7 +1105,7 @@ jobs:
          done

  pin-build-tools-image:
-    needs: [ build-build-tools-image, promote-images, regress-tests ]
+    needs: [ build-build-tools-image, promote-images, build-and-test-locally ]
    if: github.ref_name == 'main'
    uses: ./.github/workflows/pin-build-tools-image.yml
    with:
@@ -1385,7 +1127,7 @@ jobs:
    needs:
      - check-codestyle-python
      - check-codestyle-rust
-      - regress-tests
+      - build-and-test-locally
      - test-images
    runs-on: ubuntu-22.04
    steps:
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -133,221 +133,6 @@ jobs:
      - name: Check that no warnings are produced
        run: ./run_clippy.sh

-  check-linux-arm-build:
-    needs: [ check-permissions, build-build-tools-image ]
-    timeout-minutes: 90
-    runs-on: [ self-hosted, small-arm64 ]
-
-    env:
-      # Use release build only, to have less debug info around
-      # Hence keeping target/ (and general cache size) smaller
-      BUILD_TYPE: release
-      CARGO_FEATURES: --features testing
-      CARGO_FLAGS: --release
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init
-
-    steps:
-      - name: Fix git ownership
-        run: |
-          # Workaround for `fatal: detected dubious ownership in repository at ...`
-          #
-          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
-          #   Ref https://github.com/actions/checkout/issues/785
-          #
-          git config --global --add safe.directory ${{ github.workspace }}
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done
-
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 1
-
-      - name: Set pg 14 revision for caching
-        id: pg_v14_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
-
-      - name: Set pg 15 revision for caching
-        id: pg_v15_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
-
-      - name: Set pg 16 revision for caching
-        id: pg_v16_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
-
-      - name: Set env variables
-        run: |
-          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
-
-      - name: Cache postgres v14 build
-        id: cache_pg_14
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
-      - name: Cache postgres v15 build
-        id: cache_pg_15
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
-      - name: Cache postgres v16 build
-        id: cache_pg_16
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
-      - name: Build postgres v14
-        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v14 -j$(nproc)
-
-      - name: Build postgres v15
-        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v15 -j$(nproc)
-
-      - name: Build postgres v16
-        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v16 -j$(nproc)
-
-      - name: Build neon extensions
-        run: mold -run make neon-pg-ext -j$(nproc)
-
-      - name: Build walproposer-lib
-        run: mold -run make walproposer-lib -j$(nproc)
-
-      - name: Run cargo build
-        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
-
-      - name: Run cargo test
-        env:
-          NEXTEST_RETRIES: 3
-        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
-          export LD_LIBRARY_PATH
-
-          cargo nextest run $CARGO_FEATURES -j$(nproc)
-
-          # Run separate tests for real S3
-          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
-          export REMOTE_STORAGE_S3_REGION=eu-central-1
-          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
-
-          # Run separate tests for real Azure Blob Storage
-          # XXX: replace region with `eu-central-1`-like region
-          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
-          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
-          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
-          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
-
-  check-codestyle-rust-arm:
-    needs: [ check-permissions, build-build-tools-image ]
-    timeout-minutes: 90
-    runs-on: [ self-hosted, small-arm64 ]
-
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init
-
-    strategy:
-      fail-fast: false
-      matrix:
-        build_type: [ debug, release ]
-
-    steps:
-      - name: Fix git ownership
-        run: |
-          # Workaround for `fatal: detected dubious ownership in repository at ...`
-          #
-          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
-          #   Ref https://github.com/actions/checkout/issues/785
-          #
-          git config --global --add safe.directory ${{ github.workspace }}
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done
-
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 1
-
-      # Some of our rust modules use FFI and need those to be checked
-      - name: Get postgres headers
-        run: make postgres-headers -j$(nproc)
-
-      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
-      # This will catch compiler & clippy warnings in all feature combinations.
-      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
-      # NB: keep clippy args in sync with ./run_clippy.sh
-      - run: |
-          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
-          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
-            echo "No clippy args found in .neon_clippy_args"
-            exit 1
-          fi
-          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
-
-      - name: Run cargo clippy (debug)
-        if: matrix.build_type == 'debug'
-        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
-      - name: Run cargo clippy (release)
-        if: matrix.build_type == 'release'
-        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
-
-      - name: Check documentation generation
-        if: matrix.build_type == 'release'
-        run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
-        env:
-            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
-
-      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
-      - name: Check formatting
-        if: ${{ !cancelled() && matrix.build_type == 'release' }}
-        run: cargo fmt --all -- --check
-
-      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
-      - name: Check rust dependencies
-        if: ${{ !cancelled() && matrix.build_type == 'release' }}
-        run: |
-          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
-          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
-
-      # https://github.com/EmbarkStudios/cargo-deny
-      - name: Check rust licenses/bans/advisories/sources
-        if: ${{ !cancelled() && matrix.build_type == 'release' }}
-        run: cargo deny check
-
  gather-rust-build-stats:
    needs: [ check-permissions, build-build-tools-image ]
    if: |
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -261,15 +261,6 @@ version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"

-[[package]]
-name = "atomic-polyfill"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289"
-dependencies = [
- "critical-section",
-]
-
 [[package]]
 name = "atomic-take"
 version = "1.1.0"
@@ -1236,6 +1227,7 @@ dependencies = [
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
+ "rlimit",
 "rust-ini",
 "serde",
 "serde_json",
@@ -1367,6 +1359,7 @@ dependencies = [
 "tracing",
 "url",
 "utils",
+ "whoami",
 "workspace_hack",
 ]

@@ -1449,12 +1442,6 @@ dependencies = [
 "itertools",
 ]

-[[package]]
-name = "critical-section"
-version = "1.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
-
 [[package]]
 name = "crossbeam-channel"
 version = "0.5.8"
@@ -2027,16 +2014,6 @@ dependencies = [
 "tokio-util",
 ]

-[[package]]
-name = "fs2"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
-dependencies = [
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "fsevent-sys"
 version = "4.1.0"
@@ -2290,15 +2267,6 @@ dependencies = [
 "num-traits",
 ]

-[[package]]
-name = "hash32"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606"
-dependencies = [
- "byteorder",
-]
-
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
@@ -2347,18 +2315,6 @@ dependencies = [
 "num-traits",
 ]

-[[package]]
-name = "heapless"
-version = "0.8.0"
-source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963be2de24804acf5e5001#644653bf3b831c6bb4963be2de24804acf5e5001"
-dependencies = [
- "atomic-polyfill",
- "hash32",
- "rustc_version",
- "spin 0.9.8",
- "stable_deref_trait",
-]
-
 [[package]]
 name = "heck"
 version = "0.4.1"
@@ -2392,16 +2348,6 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"

-[[package]]
-name = "histogram"
-version = "0.7.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b"
-dependencies = [
- "serde",
- "thiserror",
-]
-
 [[package]]
 name = "hmac"
 version = "0.12.1"
@@ -3242,16 +3188,6 @@ dependencies = [
 "winapi",
 ]

-[[package]]
-name = "nu-ansi-term"
-version = "0.46.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
-dependencies = [
- "overload",
- "winapi",
-]
-
 [[package]]
 name = "num"
 version = "0.4.1"
@@ -3547,12 +3483,6 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

-[[package]]
-name = "overload"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
-
 [[package]]
 name = "p256"
 version = "0.11.1"
@@ -4413,6 +4343,7 @@ dependencies = [
 "tracing-opentelemetry",
 "tracing-subscriber",
 "tracing-utils",
+ "typed-json",
 "url",
 "urlencoding",
 "utils",
@@ -4611,6 +4542,15 @@ dependencies = [
 "bitflags 1.3.2",
 ]

+[[package]]
+name = "redox_syscall"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
+dependencies = [
+ "bitflags 1.3.2",
+]
+
 [[package]]
 name = "regex"
 version = "1.10.2"
@@ -4672,6 +4612,7 @@ name = "remote_storage"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-stream",
 "async-trait",
 "aws-config",
 "aws-credential-types",
@@ -4901,6 +4842,15 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

+[[package]]
+name = "rlimit"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "routerify"
 version = "3.0.0"
@@ -5169,7 +5119,6 @@ dependencies = [
 "crc32c",
 "desim",
 "fail",
- "fs2",
 "futures",
 "git-version",
 "hex",
@@ -5196,6 +5145,8 @@ dependencies = [
 "sha2",
 "signal-hook",
 "storage_broker",
+ "strum",
+ "strum_macros",
 "thiserror",
 "tokio",
 "tokio-io-timeout",
@@ -5704,9 +5655,6 @@ name = "spin"
 version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
-dependencies = [
- "lock_api",
-]

 [[package]]
 name = "spki"
@@ -5728,12 +5676,6 @@ dependencies = [
 "der 0.7.8",
 ]

-[[package]]
-name = "stable_deref_trait"
-version = "1.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
-
 [[package]]
 name = "static_assertions"
 version = "1.1.0"
@@ -5810,6 +5752,28 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "storage_controller_client"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "bytes",
+ "futures",
+ "pageserver_api",
+ "pageserver_client",
+ "postgres",
+ "reqwest 0.12.4",
+ "serde",
+ "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-stream",
+ "tokio-util",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "storage_scrubber"
 version = "0.1.0"
@@ -5829,7 +5793,6 @@ dependencies = [
 "futures",
 "futures-util",
 "hex",
- "histogram",
 "humantime",
 "itertools",
 "once_cell",
@@ -5844,6 +5807,7 @@ dependencies = [
 "serde",
 "serde_json",
 "serde_with",
+ "storage_controller_client",
 "thiserror",
 "tokio",
 "tokio-postgres",
@@ -5873,6 +5837,7 @@ dependencies = [
 "reqwest 0.12.4",
 "serde",
 "serde_json",
+ "storage_controller_client",
 "thiserror",
 "tokio",
 "tracing",
@@ -6500,17 +6465,6 @@ version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"

-[[package]]
-name = "trace"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "clap",
- "pageserver_api",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "tracing"
 version = "0.1.37"
@@ -6610,7 +6564,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
 "matchers",
- "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -6675,6 +6628,16 @@ dependencies = [
 "static_assertions",
 ]

+[[package]]
+name = "typed-json"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -6809,7 +6772,6 @@ dependencies = [
 "criterion",
 "fail",
 "futures",
- "heapless",
 "hex",
 "hex-literal",
 "humantime",
@@ -6971,6 +6933,12 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

+[[package]]
+name = "wasite"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
+
 [[package]]
 name = "wasm-bindgen"
 version = "0.2.92"
@@ -7123,6 +7091,17 @@ dependencies = [
 "once_cell",
 ]

+[[package]]
+name = "whoami"
+version = "1.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9"
+dependencies = [
+ "redox_syscall 0.4.1",
+ "wasite",
+ "web-sys",
+]
+
 [[package]]
 name = "winapi"
 version = "0.3.9"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,9 +13,9 @@ members = [
    "safekeeper",
    "storage_broker",
    "storage_controller",
+    "storage_controller/client",
    "storage_scrubber",
    "workspace_hack",
-    "trace",
    "libs/compute_api",
    "libs/pageserver_api",
    "libs/postgres_ffi",
@@ -84,7 +84,6 @@ enumset = "1.0.12"
 fail = "0.5.0"
 fallible-iterator = "0.2"
 framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
-fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
 futures-util = "0.3"
@@ -184,14 +183,16 @@ tower-service = "0.3.2"
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.21.0"
-tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
+tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 twox-hash = { version = "1.6.3", default-features = false }
+typed-json = "0.1"
 url = "2.2"
 urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
 rustls-native-certs = "0.7"
 x509-parser = "0.15"
+whoami = "1.5.1"

 ## TODO replace this with tracing
 env_logger = "0.10"
@@ -203,9 +204,6 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
 postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

-## Other git libraries
-heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
-
 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
@@ -221,6 +219,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
 safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
 desim = { version = "0.1", path = "./libs/desim" }
 storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
+storage_controller_client = { path = "./storage_controller/client" }
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
--- a/18
+++ b/18
@@ -93,13 +93,14 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
 # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
-RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
-    && /usr/local/bin/pageserver -D /data/.neon/ --init \
-       -c "id=1234" \
-       -c "broker_endpoint='http://storage_broker:50051'" \
-       -c "pg_distrib_dir='/usr/local/'" \
-       -c "listen_pg_addr='0.0.0.0:6400'" \
-       -c "listen_http_addr='0.0.0.0:9898'"
+RUN mkdir -p /data/.neon/ && \
+  echo "id=1234" > "/data/.neon/identity.toml" && \
+  echo "broker_endpoint='http://storage_broker:50051'\n" \
+       "pg_distrib_dir='/usr/local/'\n" \
+       "listen_pg_addr='0.0.0.0:6400'\n" \
+       "listen_http_addr='0.0.0.0:9898'\n" \
+  > /data/.neon/pageserver.toml && \
+  chown -R neon:neon /data/.neon

 # When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
 # that want a particular postgres version will select it explicitly: this is just a default.
@@ -110,3 +111,6 @@ VOLUME ["/data"]
 USER neon
 EXPOSE 6400
 EXPOSE 9898
+
+CMD /usr/local/bin/pageserver -D /data/.neon
+
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
 FROM build-deps AS rum-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+COPY patches/rum.patch /rum.patch
+
 RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
    echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
    mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /rum.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
--- a/13
+++ b/13
@@ -69,6 +69,8 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
 # Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
 CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib

+CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
+
 #
 # Top level Makefile to build Neon and PostgreSQL
 #
@@ -79,15 +81,24 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers walproposer-lib
+neon: postgres-headers walproposer-lib cargo-target-dir
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
+.PHONY: cargo-target-dir
+cargo-target-dir:
+	# https://github.com/rust-lang/cargo/issues/14281
+	mkdir -p target
+	test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG

 ### PostgreSQL parts
 # Some rules are duplicated for Postgres v14 and 15. We may want to refactor
 # to avoid the duplication in the future, but it's tolerable for now.
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
+
+	mkdir -p $(POSTGRES_INSTALL_DIR)
+	test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG
+
 	+@echo "Configuring Postgres $* build"
 	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
 		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -44,3 +44,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
 bytes = "1.0"
 rust-ini = "0.20.0"
+rlimit = "0.10.1"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -6,7 +6,7 @@
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
 //! - If remote_extension_config is provided, it will be used to fetch extensions list
-//!  and download `shared_preload_libraries` from the remote storage.
+//!   and download `shared_preload_libraries` from the remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -33,7 +33,6 @@
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
 //! ```
-//!
 use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
@@ -64,6 +63,7 @@ use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
 use compute_tools::swap::resize_swap;
+use rlimit::{setrlimit, Resource};

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
@@ -72,6 +72,9 @@ const BUILD_TAG_DEFAULT: &str = "latest";
 fn main() -> Result<()> {
    let (build_tag, clap_args) = init()?;

+    // enable core dumping for all child processes
+    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
+
    let (pg_handle, start_pg_result) = {
        // Enter startup tracing context
        let _startup_context_guard = startup_context_from_env();
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -56,6 +56,7 @@ pub struct ComputeNode {
    /// - we push new spec and it does reconfiguration
    /// - but then something happens and compute pod / VM is destroyed,
    ///   so k8s controller starts it again with the **old** spec
+    ///
    /// and the same for empty computes:
    /// - we started compute without any spec
    /// - we push spec and it does configuration
@@ -1116,7 +1117,7 @@ impl ComputeNode {
    // EKS worker nodes have following core dump settings:
    //   /proc/sys/kernel/core_pattern -> core
    //   /proc/sys/kernel/core_uses_pid -> 1
-    //   ulimint -c -> unlimited
+    //   ulimit -c -> unlimited
    // which results in core dumps being written to postgres data directory as core.<pid>.
    //
    // Use that as a default location and pattern, except macos where core dumps are written
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -9,6 +9,9 @@ pub(crate) struct MigrationRunner<'m> {

 impl<'m> MigrationRunner<'m> {
    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
+        // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
+        assert!(migrations.len() + 1 < i64::MAX as usize);
+
        Self { client, migrations }
    }

@@ -22,11 +25,8 @@ impl<'m> MigrationRunner<'m> {
        Ok(row.get::<&str, i64>("id"))
    }

-    fn update_migration_id(&mut self) -> Result<()> {
-        let setval = format!(
-            "UPDATE neon_migration.migration_id SET id={}",
-            self.migrations.len()
-        );
+    fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
+        let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);

        self.client
            .simple_query(&setval)
@@ -57,44 +57,49 @@ impl<'m> MigrationRunner<'m> {
    pub fn run_migrations(mut self) -> Result<()> {
        self.prepare_migrations()?;

-        let mut current_migration: usize = self.get_migration_id()? as usize;
-        let starting_migration_id = current_migration;
-
-        let query = "BEGIN";
-        self.client
-            .simple_query(query)
-            .context("run_migrations begin")?;
-
+        let mut current_migration = self.get_migration_id()? as usize;
        while current_migration < self.migrations.len() {
+            macro_rules! migration_id {
+                ($cm:expr) => {
+                    ($cm + 1) as i64
+                };
+            }
+
            let migration = self.migrations[current_migration];

            if migration.starts_with("-- SKIP") {
-                info!("Skipping migration id={}", current_migration);
+                info!("Skipping migration id={}", migration_id!(current_migration));
            } else {
                info!(
                    "Running migration id={}:\n{}\n",
-                    current_migration, migration
+                    migration_id!(current_migration),
+                    migration
                );
+
+                self.client
+                    .simple_query("BEGIN")
+                    .context("begin migration")?;
+
                self.client.simple_query(migration).with_context(|| {
-                    format!("run_migration current_migration={}", current_migration)
+                    format!(
+                        "run_migrations migration id={}",
+                        migration_id!(current_migration)
+                    )
                })?;
+
+                // Migration IDs start at 1
+                self.update_migration_id(migration_id!(current_migration))?;
+
+                self.client
+                    .simple_query("COMMIT")
+                    .context("commit migration")?;
+
+                info!("Finished migration id={}", migration_id!(current_migration));
            }

            current_migration += 1;
        }

-        self.update_migration_id()?;
-
-        let query = "COMMIT";
-        self.client
-            .simple_query(query)
-            .context("run_migrations commit")?;
-
-        info!(
-            "Ran {} migrations",
-            (self.migrations.len() - starting_migration_id)
-        );
-
        Ok(())
    }
 }
--- a/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql
+++ b/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql
--- a/compute_tools/src/migrations/0002-alter_roles.sql
+++ b/compute_tools/src/migrations/0002-alter_roles.sql
--- a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
--- a/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
--- a/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql
+++ b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql
--- a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
@@ -0,0 +1,7 @@
+DO $$
+BEGIN
+    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
+       EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
+       EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
+    END IF;
+END $$;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -777,19 +777,22 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {

    // Add new migrations in numerical order.
    let migrations = [
-        include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
-        include_str!("./migrations/0001-alter_roles.sql"),
-        include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
-        include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
-        include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
-        include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
+        include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
+        include_str!("./migrations/0002-alter_roles.sql"),
+        include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
+        include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
+        include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
+        include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
        include_str!(
-            "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
+            "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
        ),
        include_str!(
-            "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
+            "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
+        ),
+        include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
+        include_str!(
+            "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
        ),
-        include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
    ];

    MigrationRunner::new(client, &migrations).run_migrations()?;
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -40,6 +40,7 @@ safekeeper_api.workspace = true
 postgres_connection.workspace = true
 storage_broker.workspace = true
 utils.workspace = true
+whoami.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -1,9 +1,9 @@
 //! Code to manage the storage broker
 //!
-//! In the local test environment, the data for each safekeeper is stored in
+//! In the local test environment, the storage broker stores its data directly in
 //!
 //! ```text
-//!   .neon/safekeepers/<safekeeper id>
+//!   .neon
 //! ```
 use std::time::Duration;

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,8 +1,10 @@
 //! Code to manage pageservers
 //!
-//! In the local test environment, the pageserver stores its data directly in
+//! In the local test environment, the data for each pageserver is stored in
 //!
-//!   .neon/
+//! ```text
+//!   .neon/pageserver_<pageserver_id>
+//! ```
 //!
 use std::collections::HashMap;

@@ -23,6 +25,7 @@ use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use utils::auth::{Claims, Scope};
+use utils::id::NodeId;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -72,6 +75,10 @@ impl PageServerNode {
        }
    }

+    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
+        toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
+    }
+
    fn pageserver_init_make_toml(
        &self,
        conf: NeonLocalInitPageserverConf,
@@ -184,6 +191,19 @@ impl PageServerNode {
            .write_all(config.to_string().as_bytes())
            .context("write pageserver toml")?;
        drop(config_file);
+
+        let identity_file_path = datadir.join("identity.toml");
+        let mut identity_file = std::fs::OpenOptions::new()
+            .create_new(true)
+            .write(true)
+            .open(identity_file_path)
+            .with_context(|| format!("open identity toml for write: {config_file_path:?}"))?;
+        let identity_toml = self.pageserver_make_identity_toml(node_id);
+        identity_file
+            .write_all(identity_toml.to_string().as_bytes())
+            .context("write identity toml")?;
+        drop(identity_toml);
+
        // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config

        // Write metadata file, used by pageserver on startup to register itself with
@@ -349,11 +369,6 @@ impl PageServerNode {
                .map(|x| x.parse::<NonZeroU64>())
                .transpose()
                .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
-            trace_read_requests: settings
-                .remove("trace_read_requests")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'trace_read_requests' as bool")?,
            eviction_policy: settings
                .remove("eviction_policy")
                .map(serde_json::from_str)
@@ -454,11 +469,6 @@ impl PageServerNode {
                    .map(|x| x.parse::<NonZeroU64>())
                    .transpose()
                    .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
-                trace_read_requests: settings
-                    .remove("trace_read_requests")
-                    .map(|x| x.parse::<bool>())
-                    .transpose()
-                    .context("Failed to parse 'trace_read_requests' as bool")?,
                eviction_policy: settings
                    .remove("eviction_policy")
                    .map(serde_json::from_str)
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -29,7 +29,6 @@ use utils::{
 pub struct StorageController {
    env: LocalEnv,
    listen: String,
-    path: Utf8PathBuf,
    private_key: Option<Vec<u8>>,
    public_key: Option<String>,
    postgres_port: u16,
@@ -41,6 +40,8 @@ const COMMAND: &str = "storage_controller";

 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

+const DB_NAME: &str = "storage_controller";
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -65,10 +66,6 @@ pub struct InspectResponse {

 impl StorageController {
    pub fn from_env(env: &LocalEnv) -> Self {
-        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
-            .unwrap()
-            .join("attachments.json");
-
        // Makes no sense to construct this if pageservers aren't going to use it: assume
        // pageservers have control plane API set
        let listen_url = env.control_plane_api.clone().unwrap();
@@ -128,7 +125,6 @@ impl StorageController {

        Self {
            env: env.clone(),
-            path,
            listen,
            private_key,
            public_key,
@@ -203,7 +199,6 @@ impl StorageController {
    ///
    /// Returns the database url
    pub async fn setup_database(&self) -> anyhow::Result<String> {
-        const DB_NAME: &str = "storage_controller";
        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);

        let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -232,6 +227,30 @@ impl StorageController {
        Ok(database_url)
    }

+    pub async fn connect_to_database(
+        &self,
+    ) -> anyhow::Result<(
+        tokio_postgres::Client,
+        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
+    )> {
+        tokio_postgres::Config::new()
+            .host("localhost")
+            .port(self.postgres_port)
+            // The user is the ambient operating system user name.
+            // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
+            //
+            // Until we get there, use the ambient operating system user name.
+            // Recent tokio-postgres versions default to this if the user isn't specified.
+            // But tokio-postgres fork doesn't have this upstream commit:
+            // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
+            // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
+            .user(&whoami::username())
+            .dbname(DB_NAME)
+            .connect(tokio_postgres::NoTls)
+            .await
+            .map_err(anyhow::Error::new)
+    }
+
    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
        // Start a vanilla Postgres process used by the storage controller for persistence.
        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
@@ -256,18 +275,21 @@ impl StorageController {
            if !status.success() {
                anyhow::bail!("initdb failed with status {status}");
            }
-
-            // Write a minimal config file:
-            // - Specify the port, since this is chosen dynamically
-            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
-            //   the storage controller we don't want a slow local disk to interfere with that.
-            tokio::fs::write(
-                &pg_data_path.join("postgresql.conf"),
-                format!("port = {}\nfsync=off\n", self.postgres_port),
-            )
-            .await?;
        };

+        // Write a minimal config file:
+        // - Specify the port, since this is chosen dynamically
+        // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
+        //   the storage controller we don't want a slow local disk to interfere with that.
+        //
+        // NB: it's important that we rewrite this file on each start command so we propagate changes
+        // from `LocalEnv`'s config file (`.neon/config`).
+        tokio::fs::write(
+            &pg_data_path.join("postgresql.conf"),
+            format!("port = {}\nfsync=off\n", self.postgres_port),
+        )
+        .await?;
+
        println!("Starting storage controller database...");
        let db_start_args = [
            "-w",
@@ -296,11 +318,38 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

+        // We support running a startup SQL script to fiddle with the database before we launch storcon.
+        // This is used by the test suite.
+        let startup_script_path = self
+            .env
+            .base_data_dir
+            .join("storage_controller_db.startup.sql");
+        let startup_script = match tokio::fs::read_to_string(&startup_script_path).await {
+            Ok(script) => {
+                tokio::fs::remove_file(startup_script_path).await?;
+                script
+            }
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    // always run some startup script so that this code path doesn't bit rot
+                    "BEGIN; COMMIT;".to_string()
+                } else {
+                    anyhow::bail!("Failed to read startup script: {e}")
+                }
+            }
+        };
+        let (mut client, conn) = self.connect_to_database().await?;
+        let conn = tokio::spawn(conn);
+        let tx = client.build_transaction();
+        let tx = tx.start().await?;
+        tx.batch_execute(&startup_script).await?;
+        tx.commit().await?;
+        drop(client);
+        conn.await??;
+
        let mut args = vec![
            "-l",
            &self.listen,
-            "-p",
-            self.path.as_ref(),
            "--dev",
            "--database-url",
            &database_url,
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -17,6 +17,7 @@ pageserver_client.workspace = true
 reqwest.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
+storage_controller_client.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -14,15 +14,15 @@ use pageserver_api::{
    },
    shard::{ShardStripeSize, TenantShardId},
 };
-use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
+use pageserver_client::mgmt_api::{self};
 use reqwest::{Method, StatusCode, Url};
-use serde::{de::DeserializeOwned, Serialize};
 use utils::id::{NodeId, TenantId};

 use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
    TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
+use storage_controller_client::control_api::Client;

 #[derive(Subcommand, Debug)]
 enum Command {
@@ -56,6 +56,10 @@ enum Command {
        #[arg(long)]
        scheduling: Option<NodeSchedulingPolicy>,
    },
+    NodeDelete {
+        #[arg(long)]
+        node_id: NodeId,
+    },
    /// Modify a tenant's policies in the storage controller
    TenantPolicy {
        #[arg(long)]
@@ -245,64 +249,6 @@ impl FromStr for NodeAvailabilityArg {
    }
 }

-struct Client {
-    base_url: Url,
-    jwt_token: Option<String>,
-    client: reqwest::Client,
-}
-
-impl Client {
-    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
-        Self {
-            base_url,
-            jwt_token,
-            client: reqwest::ClientBuilder::new()
-                .build()
-                .expect("Failed to construct http client"),
-        }
-    }
-
-    /// Simple HTTP request wrapper for calling into storage controller
-    async fn dispatch<RQ, RS>(
-        &self,
-        method: Method,
-        path: String,
-        body: Option<RQ>,
-    ) -> mgmt_api::Result<RS>
-    where
-        RQ: Serialize + Sized,
-        RS: DeserializeOwned + Sized,
-    {
-        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
-        // for general purpose API access.
-        let url = Url::from_str(&format!(
-            "http://{}:{}/{path}",
-            self.base_url.host_str().unwrap(),
-            self.base_url.port().unwrap()
-        ))
-        .unwrap();
-
-        let mut builder = self.client.request(method, url);
-        if let Some(body) = body {
-            builder = builder.json(&body)
-        }
-        if let Some(jwt_token) = &self.jwt_token {
-            builder = builder.header(
-                reqwest::header::AUTHORIZATION,
-                format!("Bearer {jwt_token}"),
-            );
-        }
-
-        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
-        let response = response.error_from_body().await?;
-
-        response
-            .json()
-            .await
-            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
-    }
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();
@@ -337,7 +283,7 @@ async fn main() -> anyhow::Result<()> {
        }
        Command::TenantCreate { tenant_id } => {
            storcon_client
-                .dispatch(
+                .dispatch::<_, ()>(
                    Method::POST,
                    "v1/tenant".to_string(),
                    Some(TenantCreateRequest {
@@ -357,13 +303,16 @@ async fn main() -> anyhow::Result<()> {
            tracing::info!("Delete status: {}", status);
        }
        Command::Nodes {} => {
-            let resp = storcon_client
+            let mut resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
+
+            resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
+
            let mut table = comfy_table::Table::new();
            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
            for node in resp {
@@ -395,13 +344,16 @@ async fn main() -> anyhow::Result<()> {
                .await?;
        }
        Command::Tenants {} => {
-            let resp = storcon_client
+            let mut resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
                    "control/v1/tenant".to_string(),
                    None,
                )
                .await?;
+
+            resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
+
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
@@ -650,6 +602,11 @@ async fn main() -> anyhow::Result<()> {
                .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
                .await?;
        }
+        Command::NodeDelete { node_id } => {
+            storcon_client
+                .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
+                .await?;
+        }
        Command::TenantSetTimeBasedEviction {
            tenant_id,
            period,
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -33,7 +33,7 @@ echo $result | jq .

 generate_id timeline_id
 PARAMS=(
-     -sb 
+     -sbf
     -X POST
     -H "Content-Type: application/json"
     -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -31,25 +31,14 @@ services:
    restart: always
    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
-      - BROKER_ENDPOINT='http://storage_broker:50051'
      - AWS_ACCESS_KEY_ID=minio
      - AWS_SECRET_ACCESS_KEY=password
      #- RUST_BACKTRACE=1
    ports:
       #- 6400:6400  # pg protocol handler
       - 9898:9898 # http endpoints
-    entrypoint:
-      - "/bin/sh"
-      - "-c"
-    command:
-      - "/usr/local/bin/pageserver -D /data/.neon/
-                                   -c \"broker_endpoint=$$BROKER_ENDPOINT\"
-                                   -c \"listen_pg_addr='0.0.0.0:6400'\"
-                                   -c \"listen_http_addr='0.0.0.0:9898'\"
-                                   -c \"remote_storage={endpoint='http://minio:9000',
-                                                        bucket_name='neon',
-                                                        bucket_region='eu-north-1',
-                                                        prefix_in_bucket='/pageserver/'}\""
+    volumes:
+      - ./pageserver_config:/data/.neon/
    depends_on:
      - storage_broker
      - minio_create_buckets
--- a/docker-compose/pageserver_config/identity.toml
+++ b/docker-compose/pageserver_config/identity.toml
@@ -0,0 +1 @@
+id=1234
--- a/docker-compose/pageserver_config/pageserver.toml
+++ b/docker-compose/pageserver_config/pageserver.toml
@@ -0,0 +1,5 @@
+broker_endpoint='http://storage_broker:50051'
+pg_distrib_dir='/usr/local/'
+listen_pg_addr='0.0.0.0:6400'
+listen_http_addr='0.0.0.0:9898'
+remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
--- a/docs/rfcs/034-ancestor-deletion.md
+++ b/docs/rfcs/034-ancestor-deletion.md
@@ -0,0 +1,252 @@
+# Ancestor Timeline Deletion
+
+Created on: 2024-02-23
+
+Author: John Spray
+
+# Summary
+
+When a tenant creates a new timeline that they will treat as their 'main' history,
+it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
+this is necessary because it is forbidden to delete a timeline which has descendents.
+
+A new pageserver API is proposed to 'adopt' data from a parent timeline into
+one of its children, such that the link between ancestor and child can be severed,
+leaving the parent in a state where it may then be deleted.
+
+# Motivation
+
+Retaining parent timelines currently has two costs:
+
+- Cognitive load on users, who have to remember which is the "real" main timeline.
+- Storage capacity cost, as the parent timeline will retain layers up to the
+  child's timeline point, even if the child fully covers its keyspace with image
+  layers and will never actually read from the parent.
+
+# Solution
+
+A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
+will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
+wish to detach from its parent.
+
+On success, this API will leave the following state:
+
+- The detached child timeline will no longer have an ancestor, and will contain all
+  the data needed to service reads without recursing into an ancestor.
+- Any other children of the parent whose timeline points were at a lower LSN than
+  the detached child timeline will be modified to have the child timeline as their
+  new parent.
+- The parent timeline will still exist, but the child will no longer have it as an
+  ancestor. If this was the last timeline that depended on the parent, then the
+  parent will become deletable.
+
+This API's implementation will consist of a series of retryable steps, such that
+on failures/timeout it can safely be called again to reach the target state.
+
+## Example
+
+### Before
+
+The user has "rolled back" their project to LSN X, resulting in a "new main"
+timeline. The parent "old main" timeline still exists, and they would like
+to clean it up.
+
+They have two other timelines A and B. A is from before the rollback point,
+and B is from after the rollback point.
+
+```
+----"old main" timeline-------X-------------------------------------------->
+                |             |                         |
+                |-> child A   |                         |
+                              |-> "new main" timeline   |
+                                                        -> child B
+
+```
+
+### After calling detach ancestor API
+
+The "new main" timeline is no longer dependent on old main, and neither
+is child A, because it had a branch point before X.
+
+The user may now choose to delete child B and "old main" to get to
+a pristine state. Child B is likely to be unwanted since the user
+chose to roll back to X, and it branches from after X. However, we
+don't assume this in the API; it is up to the user to delete it.
+
+```
+|----"old main" timeline---------------------------------------------------->
+                                                         |
+                                                         |
+                                                         |
+                                                         -> child B
+
+|----"new main" timeline--------->
+                 |
+                 |-> child A
+
+
+```
+
+### After removing timelines
+
+We end up with a totally clean state that leaves no trace that a rollback
+ever happened: there is only one root timeline.
+
+```
+| ----"new main" timeline----------->
+                |
+                |-> child A
+
+
+```
+
+## Caveats
+
+Important things for API users to bear in mind:
+
+- this API does not delete the parent timeline: you must still do that explicitly.
+- if there are other child timelines ahead of the branch point of the detached
+  child, the parent won't be deletable: you must either delete or detach those
+  children.
+- do _not_ simply loop over all children and detach them all: this can have an
+  extremely high storage cost. The detach ancestor API is intended for use on a single
+  timeline to make it the new "main".
+- The detach ancestor API should also not be
+  exposed directly to the user as button/API, because they might decide
+  to click it for all the children and thereby generate many copies of the
+  parent's data -- the detach ancestor API should be used as part
+  of a high level "clean up after rollback" feature.
+
+## `detach_ancestor` API implementation
+
+Terms used in the following sections:
+
+- "the child": the timeline whose ID is specified in the detach ancestor API URL, also
+  called "new main" in the example.
+- "the parent": the parent of "the child". Also called "old main" in the example.
+- "the branch point" the ancestor_lsn of "the child"
+
+### Phase 1: write out adopted layers to S3
+
+The child will "adopt" layers from the parent, such that its end state contains
+all the parent's history as well as its own.
+
+For all layers in the parent's layer map whose high LSN is below the branch
+point, issue S3 CopyObject requests to duplicate them into the child timeline's
+prefix. Do not add them to the child's layer map yet.
+
+For delta layers in the parent's layer map which straddle the branch point, read them
+and write out only content up to the branch point into new layer objects.
+
+This is a long running operation if the parent has many layers: it should be
+implemented in a way that resumes rather than restarting from scratch, if the API
+times out and is called again.
+
+As an optimization, if there are no other timelines that will be adopted into
+the child, _and_ the child's image layers already full cover the branch LSN,
+then we may skip adopting layers.
+
+### Phase 2: update the child's index
+
+Having written out all needed layers in phase 1, atomically link them all
+into the child's IndexPart and upload to S3. This may be done while the
+child Timeline is still running.
+
+### Phase 3: modify timelines ancestry
+
+Modify the child's ancestor to None, and upload its IndexPart to persist the change.
+
+For all timelines which have the same parent as the child, and have a branch
+point lower than our branch point, switch their ancestor_timeline to the child,
+and upload their IndexPart to persist the change.
+
+## Alternatives considered
+
+### Generate full image layer on child, rather than adopting parent deltas
+
+This would work for the case of a single child, but would prevent re-targeting
+other timelines that depended on the parent. If we detached many children this
+way, the storage cost would become prohibitive (consider a 1TB database with
+100 child timelines: it would cost 100TiB if they all generated their own image layers).
+
+### Don't rewrite anything: just fake it in the API
+
+We could add a layer of indirection that let a child "pretend" that it had no
+ancestor, when in reality it still had the parent. The pageserver API could
+accept deletion of ancestor timelines, and just update child metadata to make
+them look like they have no ancestor.
+
+This would not achieve the desired reduction in storage cost, and may well be more
+complex to maintain than simply implementing the API described in this RFC.
+
+### Avoid copying objects: enable child index to use parent layers directly
+
+We could teach IndexPart to store a TimelineId for each layer, such that a child
+timeline could reference a parent's layers directly, rather than copying them
+into the child's prefix.
+
+This would impose a cost for the normal case of indices that only target the
+timeline's own layers, add complexity, and break the useful simplifying
+invariant that timelines "own" their own path. If child timelines were
+referencing layers from the parent, we would have to ensure that the parent
+never runs GC/compaction again, which would make the API less flexible (the
+proposal in this RFC enables deletion of the parent but doesn't require it.)
+
+## Performance
+
+### Adopting layers
+
+- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
+  of such requests: this can take up to tens of seconds and will compete for RemoteStorage
+  semaphore units with other activity on the pageserver.
+- If we are running on storage backend that doesn't implement CopyObject, then
+  this part will be much more expensive as we would stream all layer content
+  through the pageserver. This is no different to issuing a lot
+  of reads to a timeline that does not have a warm local cache: it will move
+  a lot of gigabytes, but that shouldn't break anything.
+- Generating truncated layers for delta that straddle the branch point will
+  require streaming read/write of all the layers in question.
+
+### Updating timeline ancestry
+
+The simplest way to update timeline ancestry will probably be to stop and start
+all the Timeline objects: this is preferable to the complexity of making their
+ancestry mutable at runtime.
+
+There will be a corresponding "stutter" in the availability of the timelines,
+of the order 10-100ms, which is the time taken to upload their IndexPart, and
+restart the Timeline.
+
+# Interaction with other features
+
+## Concurrent timeline creation
+
+If new historic timelines are created using the parent as an ancestor while the
+detach ancestor API is running, they will not be re-parented to the child. This
+doesn't break anything, but it leaves the parent in a state where it might not
+be possible to delete it.
+
+Since timeline creations are an explicit user action, this is not something we need to
+worry about as the storage layer: a user who wants to delete their parent timeline will not create
+new children, and if they do, they can choose to delete those children to
+enable deleting the parent.
+
+For the least surprise to the user, before starting the detach ancestor branch
+operation, the control plane should wait until all branches are created and not
+allow any branches to be created before the branch point on the ancestor branch
+while the operation is ongoing.
+
+## WAL based disaster recovery
+
+WAL based disaster recovery currently supports only restoring of the main
+branch. Enabling WAL based disaster recovery in the future requires that we
+keep a record which timeline generated the WAL and at which LSN was a parent
+detached. Keep a list of timeline ids and the LSN in which they were detached in
+the `index_part.json`. Limit the size of the list to 100 first entries, after
+which the WAL disaster recovery will not be possible.
+
+## Sharded tenants
+
+For sharded tenants, calls to the detach ancestor API will pass through the storage
+controller, which will handle them the same as timeline creations: invoke first
+on shard zero, and then on all the other shards.
--- a/docs/rfcs/035-timeline-archive.md
+++ b/docs/rfcs/035-timeline-archive.md
@@ -0,0 +1,507 @@
+# Timeline Archival
+
+## Summary
+
+This RFC describes a mechanism for pageservers to eliminate local storage + compute work
+for timelines which are not in use, in response to external API calls to "archive" a timeline.
+
+The archived state roughly corresponds to fully offloading a timeline to object storage, such
+that its cost is purely the cost of that object storage.
+
+## Motivation
+
+Archived timelines serve multiple purposes:
+- Act as a 'snapshot' for workloads that would like to retain restorable copies of their
+  database from longer ago than their PITR window.
+- Enable users to create huge numbers of branches (e.g. one per github PR) without having
+  to diligently clean them up later to avoid overloading the pageserver (currently we support
+  up to ~500 branches per tenant).
+
+### Prior art
+
+Most storage and database systems have some form of snapshot, which can be implemented several ways:
+1. full copies of data (e.g. an EBS snapshot to S3)
+2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS.
+3. a series of snapshots which are CoW or de-duplicated relative to one another.
+
+Today's Neon branches are approximately like `2.`, although due to implementation details branches
+often end up storing much more data than they really need, as parent branches assume that all data
+at the branch point is needed.  The layers pinned in the parent branch may have a much larger size
+than the physical size of a compressed image layer representing the data at the branch point.
+
+## Requirements
+
+- Enter & exit the archived state in response to external admin API calls
+- API calls to modify the archived state are atomic and durable
+- An archived timeline should eventually (once out of PITR window) use an efficient compressed
+  representation, and avoid retaining arbitrarily large data in its parent branch.
+- Remote object GETs during tenant start may be O(N) with the number of _active_ branches,
+  but must not scale with the number of _archived_ branches.
+- Background I/O for archived branches should only be done a limited number of times to evolve them
+  to a long-term-efficient state (e.g. rewriting to image layers).  There should be no ongoing "housekeeping"
+  overhead for archived branches, including operations related to calculating sizes for billing.
+- The pageserver should put no load on the safekeeper for archived branches.
+- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch
+  to a performant state in a short time (linear with the branch's logical size)
+
+## Non Goals
+
+- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored
+  in Neon's internal format.
+- Compute cold starts after activating an archived branch will not have comparable performance to
+  cold starts on an active branch.
+- Archived branches will not use any new/additional compression or de-duplication beyond what
+  is already implemented for image layers (zstd per page).
+- The pageserver will not "auto start" archived branches in response to page_service API requests: they
+  are only activated explicitly via the HTTP API.
+- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will
+  remain on local disk, although existing eviction mechanisms will remove any segments from local disk.
+- We will not expose any prometheus metrics for archived timelines, or make them visible in any
+  detailed HTTP APIs other than the specific API for listing archived timelines.
+- A parent branch may not be archived unless all its children are.
+
+## Impacted Components
+
+pageserver, storage controller
+
+## Terminology
+
+**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller
+may assume that this branch is now very cheap to store, although this may not be physically so until the
+branch proceeds to the offloaded state.
+
+**Active** branches are branches which are available for use by page_service clients, and have a relatively
+high cost due to consuming local storage.
+
+**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such
+that they now consume minimal runtime resources and have a cost similar to the cost of object storage.
+
+**Activate** (verb): transition from Archived to Active
+
+**Archive** (verb): transition from Active to Archived
+
+**Offload** (verb): transition from Archived to Offloaded
+
+**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load.
+
+**Warm up** (verb): operation done on an active branch, by downloading its active layers.  Once a branch is
+warmed up, good performance will be available to page_service clients.
+
+## Implementation
+
+### High level flow
+
+We may think of a timeline which is archived and then activated as proceeding through a series of states:
+
+```mermaid
+stateDiagram
+  [*] --> Active(warm)
+  Active(warm) --> Archived
+  Archived --> Offloaded
+  Archived --> Active(warm)
+  Offloaded --> Active(cold)
+  Active(cold) --> Active(warm)
+```
+
+Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles
+of branches will be:
+- Very frequent: Short lived branches: Active -> Deleted
+- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted
+- Rare: Branches used to restore old state: Active ->Archived -> Offloaded -> Active
+
+These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination
+of:
+- the timeline's lifecycle state: active or archived, stored in the timeline's index
+- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the
+  manifest of offloaded timelines.
+- cache state (whether it's warm or cold).
+
+### Storage format changes
+
+There are two storage format changes:
+1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to
+   be considered active or archived.
+2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load
+   at startup (and is available for storing other small, rarely changing tenant-wide attributes in future)
+
+The manifest object will have a format like this:
+```
+{
+  "offload_timelines": [
+    {
+      "timeline_id": ...
+      "last_record_lsn": ...
+      "last_record_lsn_time": ...
+      "pitr_interval": ...
+      "last_gc_lsn": ...  # equal to last_record_lsn if this branch has no history (i.e. a snapshot)
+      "logical_size": ...  # The size at last_record_lsn
+      "physical_size" ...
+      "parent": Option<{
+        "timeline_id"...
+        "lsn"... # Branch point LSN on the parent
+        "requires_data": bool # True if this branch depends on layers in its parent, identify it here
+
+      }>
+    }
+  ]
+}
+```
+
+The information about a timeline in its offload state is intentionally minimal: just enough to decide:
+- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this
+  by checking if now > last_record_lsn_time - pitr_interval, and pitr_lsn < last_record_lsn.
+- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing
+  layers that the archived branch depends on
+- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request
+  is received for a timeline_id that isn't in the site of live `Timelines` or in the manifest, then
+  we don't need to go to S3 for the deletion.
+- How much archived space to report in consumption metrics
+
+The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total
+set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded`
+(offloaded timelines).
+
+For split-brain protection, the manifest object will be written with a generation suffix, in the same way as
+index_part objects are (see [generation numbers RFC](025-generation-numbers.md)).  This will add some complexity, but
+give us total safety against two pageservers with the same tenant attached fighting over the object.  Existing code
+for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover
+the manifest file.
+
+### API & Timeline state
+
+Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart.  This will
+be controlled by a new per-timeline `configure` endpoint.  This is intentionally generic naming, which
+may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval
+a per-timeline configuration).
+
+`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`
+```
+{
+  'state': 'active|archive'
+}
+```
+
+When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded.
+
+When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part,
+**and** the `Timeline` object has been instantiated and activated.  This will require reading the timeline's
+index, but not any data: it should be about as fast as a couple of small S3 requests.
+
+The API will be available with identical path via the storage controller: calling this on a sharded tenant
+will simply map the API call to all the shards.
+
+Archived timelines may never have descendent timelines which are active.  This will be enforced at the API level,
+such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires
+that all its descendents are archived.  It is the callers responsibility to walk the hierarchy of timelines
+in the proper order if they would like to archive whole trees of branches.
+
+Because archive timelines will be excluded from the usual timeline listing APIs, a new API specifically
+for archived timelines will be added: this is for use in support/debug:
+
+```
+GET /v1/tenants/{tenant_id}/archived_timelines
+
+{
+  ...same per-timeline content as the tenant manifest...
+}
+
+```
+
+### Tenant attach changes
+
+Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline
+we load their index_part.json.  To avoid the number of GETs scaling linearly with the number of archived
+timelines, we must have a single object that tells us which timelines do not need to be loaded.  The
+number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic
+because each request covers 1000 timelines.
+
+This is **not** literally the same as the set of timelines who have state=archived.  Rather, it is
+the set of timelines which have been offloaded in the background after their state was set to archived.
+
+We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't
+exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need
+to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying
+to delete an offloaded timeline.
+
+### Warm-up API
+
+`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234`
+
+This API will be similar to the existing `download_remote_layers` API, but smarter:
+- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read)
+- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress
+  of downloads, so that the caller can poll.
+
+The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set
+of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers
+can possibly be read from these LSNs.  This concept of layer visibility is more generally useful for cache
+eviction and heatmaps, as well as in this specific case of warming up a timeline.
+
+The caller does not have to wait for the warm up API, or call it at all.  But it is strongly advised
+to call it, because otherwise populating local contents for a timeline can take a long time when waiting
+for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite
+volatile.
+
+### Background work
+
+Archived branches are not subject to normal compaction.  Instead, when the compaction loop encounters
+an archived branch, it will consider rewriting the branch to just image layers if the branch has no history
+([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk
+if its state permits that.
+
+Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider
+optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR
+has elapsed and it can now be rewritten to image layers.
+
+#### Archive branch offload
+
+Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do
+any actual work.
+
+This work is done in the background compaction loop.  It makes sense to tag this work on to the compaction
+loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency.
+
+The condition for offload is simple:
+ - a `Timeline` object exists with state `Archived`
+ - the timeline does not have any non-offloaded children.
+ 
+ Regarding the condition that children must be offloaded, this will always be eventually true, because
+ we enforce at the API level that children of archived timelines must themselves be archived, and all
+ archived timelines will eventually be offloaded.
+
+Offloading a timeline is simple:
+- Read the timeline's attributes that we will store in its offloaded state (especially its logical size)
+- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it)
+- Erase all the timeline's content from local storage (`remove_dir_all` on its path)
+- Write the tenant manifest to S3 to prevent this timeline being loaded on next start.
+
+#### Archive branch optimization (flattening)
+
+When we offloaded a branch, it might have had some history that prevented rewriting it to a single
+point in time set of image layers.  For example, a branch might have several days of writes and a 7
+day PITR: when we archive it, it still has those days of history.
+
+Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by:
+- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing
+  a point in time compared with delta layers
+- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor
+  for data, i.e. the ancestor is free to GC layers files at+below the branch point
+
+Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the
+branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes
+a true snapshot at that LSN.
+
+It is not always more efficient to flatten a branch than to keep some extra history on the parent: this
+is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper)
+
+Archive branch optimization should be done _before_ background offloads during compaction, because there may
+be timelines which are ready to be offloaded but also would benefit from the optimization step before
+being offloaded.  For example, a branch which has already fallen out of PITR window and has no history
+of its own may be immediately re-written as a series of image layers before being offloaded.
+
+### Consumption metrics
+
+Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipating
+that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived
+vs. ordinary content.
+
+Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size`
+variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
+
+### Secondary locations
+
+Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby
+when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents
+will be dropped from secondary locations.
+
+### Sharding
+
+Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in
+the same way that timeline creation and deletion is done.  There are no special rules about ordering:
+the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline.
+
+Since consumption metrics are only transmitted from shard zero, the state of archival on this shard
+will be authoritative for consumption metrics.
+
+## Error cases
+
+### Errors in sharded tenants
+
+If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed
+state, where a timeline is archived on some shards but not on others.  
+
+We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline
+are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest).
+In the transient case callers are expected to retry until success, or to make appropriate API calls to clear
+up their mistake.  We rely on this good behavior of callers to eventually get timelines into a consistent
+state across all shards.  If callers do leave a timeline in an inconsistent state across shards, this doesn't
+break anything, it's just "weird".
+
+This is similar to the status quo for timeline creation and deletion: callers are expected to retry
+these operations until they succeed.
+
+### Archiving/activating
+
+Archiving/activating a timeline can fail in a limited number of ways:
+1. I/O error storing/reading the timeline's updated index
+    - These errors are always retryable: a fundamental design assumption of the pageserver is that remote
+      storage errors are always transient. 
+2. NotFound if the timeline doesn't exist
+    - Callers of the API are expected to avoid calling deletion and archival APIs concurrently.
+    - The storage controller has runtime locking to prevent races such as deleting a timeline while
+      archiving it.
+3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated
+    - Callers are expected to do their own checks to avoid hitting this case.  If they make
+      a mistake and encounter this error, they should give up.
+
+### Offloading
+
+Offloading can only fail if remote storage is unavailable, which would prevent us from writing the
+tenant manifest.  In such error cases, we give up in the expectation that offloading will be tried 
+again at the next iteration of the compaction loop.
+
+### Archive branch optimization
+
+Optimization is a special form of compaction, so can encounter all the same errors as regular compaction
+can: it should return Result<(), CompactionError>, and as with compaction it will be retried on
+the next iteration of the compaction loop.
+
+## Optimizations
+
+### Delaying storage optimization if retaining parent layers is cheaper
+
+Optimizing archived branches to image layers and thereby enabling parent branch GC to progress
+is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they
+are offloaded to S3 they're totally safe, inert things.
+
+However, in some cases it can be advantageous to retain extra history on their parent branch rather
+than flattening the archived branch.  For example, if a 1TB parent branch is rather slow-changing (1GB
+of data per day), and archive branches are being created nightly, then writing out full 1TB image layers
+for each nightly branch is inefficient compared with just keeping more history on the main branch.
+
+Getting this right requires consideration of:
+- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to
+  write out extra image layers, then it might make more sense to just write out the image layers on
+  the archived branch.
+- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes
+  the layer map (and index_part) bigger.  There are practical limits beyond which writing an indefinitely
+  large layer map can cause problems elsewhere.
+
+This optimization can probably be implemented quite cheaply with some basic heuristics like:
+- don't bother doing optimization on an archive branch if the LSN distance between
+  its branch point and the end of the PITR window is <5% of the logical size of the archive branch.
+- ...but, Don't keep more history on the main branch than double the PITR
+
+### Creating a timeline in archived state (a snapshot)
+
+Sometimes, one might want to create a branch with no history, which will not be written to
+before it is archived.  This is a snapshot, although we do not require a special snapshot API,
+since a snapshot can be represented as a timeline with no history.
+
+This can be accomplished by simply creating a timeline and then immediately archiving it, but
+that is somewhat wasteful: this timeline it will spin up various tasks and open a connection to the storage
+broker to try and ingest WAL, before being shutdown in the subsequent archival call.  To explicitly
+support this common special case, we may add a parameter to the timeline creation API which
+creates a timeline directly into the archived state.
+
+Such a timeline creation will do exactly two I/Os at creation time:
+- write the index_part object to record the timeline's existence
+- when the timeline is offloaded in the next iteration of the compaction loop (~20s later),
+  write the tenant manifest.
+
+Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake
+up the 'snapshot' branch and write out image layers.
+
+## Future Work
+
+### Enabling `fullbackup` dumps from archive branches
+
+It would be useful to be able to export an archive branch to another system, or for use in a local
+postgres database.
+
+This could be implemented as a general capability for all branches, in which case it would "just work"
+for archive branches by activating them.  However, downloading all the layers in a branch just to generate
+a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches
+which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk.
+
+Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem
+is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup 
+stream to S3 in an intermediate format and, then having one node stitch them together).
+
+### Tagging layers from archived branches
+
+When we know a layer is an image layer written for an archived branch that has fallen off the PITR window,
+we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even
+cheaper storage.
+
+This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver
+external hints on which branches are likely to be reactivated, and which branches are good candidates for
+tagging for low performance storage.
+
+Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes.  Other clouds' object
+stores have similar mechanisms.
+
+### Storing sequences of archive branches as deltas
+
+When archived branches are used as scheduled snapshots, we could store them even more efficiently
+by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the
+storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified
+pages). This is the kind of encoding that many backup storage systems use.
+
+The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding
+vs. just writing out a simple stream of the entire database.  For smaller databases, writing out a full
+copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds,
+so the complexity tradeoff of diff-encoding it is dubious).
+
+One does not necessarily have to read-back the previous snapshot in order to encoded the next one: if the
+pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that
+we can say: "A branch exists from Monday night.  I have Monday night's data still active in the main branch,
+so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's
+delta snapshot".
+
+Clearly this all requires careful housekeeping to retain the relationship between branches that depend on
+each other: perhaps this would be done by making the archive branches have child/parent relationships with
+each other, or perhaps we would permit them to remain children of their original parent, but additionally
+have a relationship with the snapshot they're encoded relative to.
+
+Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring
+out how frequently to write a full copy is important.  This is essentially a zoomed-out version of what
+we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.
+
+
+## FAQ/Alternatives
+
+### Store all timelines in the tenant manifest
+
+Rather than special-casing offloaded timelines in the offload manifest, we could store a total
+manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on
+startup.
+
+That would be a more invasive change (require hooking in to timeline creation), and would
+generate much more I/O to this manifest for tenants that had many branches _and_ frequent
+create/delete cycles for short lived branches.  Restricting the manifest to offloaded timelines
+means that we only have to cope with the rate at which long-lived timelines are archived, rather
+than the rate at which sort lived timelines are created & destroyed.
+
+### Automatically archiving/activating timelines without external API calls
+
+We could implement TTL driven offload of timelines, waking them up when a page request
+arrives.
+
+This has downsides:
+- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't
+  know which of their branches are in this state, and might get a surprise when they try
+  to use such a branch.
+- Price fluctuation: if the archival of a branch is used in end user pricing, then users
+  prefer clarity & consistency.  Ideally a branch's storage should cost the same from the moment it
+  is created, rather than having a usage-dependency storage price.
+- Complexity: enabling the page service to call up into the Tenant to activate a timeline
+  would be awkward, compared with an external entry point.
+
+### Make offloaded a state of Timeline
+
+To reduce the operator-facing complexity of having some timelines APIs that only return
+non-offloaded timelines, we could build the offloaded state into the Timeline type.
+
+`timeline.rs` is already one of the most egregiously long source files in the tree, so
+this is rejected on the basis that we need to avoid making that complexity worse.
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration:
 - Use `diesel migration generate <name>` to create a new migration
 - Populate the SQL files in the `migrations/` subdirectory
 - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically.
-  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
+  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
 - Commit the migration files and the changes to schema.rs
 - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
 - The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -87,7 +87,7 @@ pub struct TenantLocateResponse {
    pub shard_params: ShardParameters,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Debug)]
 pub struct TenantDescribeResponse {
    pub tenant_id: TenantId,
    pub shards: Vec<TenantDescribeResponseShard>,
@@ -110,7 +110,7 @@ pub struct NodeDescribeResponse {
    pub listen_pg_port: u16,
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Debug)]
 pub struct TenantDescribeResponseShard {
    pub tenant_shard_id: TenantShardId,

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -5,7 +5,6 @@ pub mod utilization;
 pub use utilization::PageserverUtilization;

 use std::{
-    borrow::Cow,
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
@@ -20,7 +19,6 @@ use serde::{Deserialize, Serialize};
 use serde_with::serde_as;
 use utils::{
    completion,
-    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
    serde_system_time,
@@ -294,7 +292,6 @@ pub struct TenantConfig {
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
-    pub trace_read_requests: Option<bool>,
    pub eviction_policy: Option<EvictionPolicy>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
@@ -652,6 +649,17 @@ pub struct TenantDetails {
    pub timelines: Vec<TimelineId>,
 }

+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
+pub enum TimelineArchivalState {
+    Archived,
+    Unarchived,
+}
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
+pub struct TimelineArchivalConfigRequest {
+    pub state: TimelineArchivalState,
+}
+
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
@@ -716,58 +724,7 @@ pub struct LayerMapInfo {
    pub historic_layers: Vec<HistoricLayerInfo>,
 }

-#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)]
-#[repr(usize)]
-pub enum LayerAccessKind {
-    GetValueReconstructData,
-    Iter,
-    KeyIter,
-    Dump,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct LayerAccessStatFullDetails {
-    pub when_millis_since_epoch: u64,
-    pub task_kind: Cow<'static, str>,
-    pub access_kind: LayerAccessKind,
-}
-
-/// An event that impacts the layer's residence status.
-#[serde_as]
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct LayerResidenceEvent {
-    /// The time when the event occurred.
-    /// NB: this timestamp is captured while the residence status changes.
-    /// So, it might be behind/ahead of the actual residence change by a short amount of time.
-    ///
-    #[serde(rename = "timestamp_millis_since_epoch")]
-    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
-    pub timestamp: SystemTime,
-    /// The new residence status of the layer.
-    pub status: LayerResidenceStatus,
-    /// The reason why we had to record this event.
-    pub reason: LayerResidenceEventReason,
-}
-
-/// The reason for recording a given [`LayerResidenceEvent`].
-#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
-pub enum LayerResidenceEventReason {
-    /// The layer map is being populated, e.g. during timeline load or attach.
-    /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
-    /// We need to record such events because there is no persistent storage for the events.
-    ///
-    // https://github.com/rust-lang/rust/issues/74481
-    /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
-    /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
-    LayerLoad,
-    /// We just created the layer (e.g., freeze_and_flush or compaction).
-    /// Such layers are always [`LayerResidenceStatus::Resident`].
-    LayerCreate,
-    /// We on-demand downloaded or evicted the given layer.
-    ResidenceChange,
-}
-
-/// The residence status of the layer, after the given [`LayerResidenceEvent`].
+/// The residence status of a layer
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub enum LayerResidenceStatus {
    /// Residence status for a layer file that exists locally.
@@ -777,23 +734,16 @@ pub enum LayerResidenceStatus {
    Evicted,
 }

-impl LayerResidenceEvent {
-    pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self {
-        Self {
-            status,
-            reason,
-            timestamp: SystemTime::now(),
-        }
-    }
-}
-
+#[serde_as]
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerAccessStats {
-    pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
-    pub task_kind_access_flag: Vec<Cow<'static, str>>,
-    pub first: Option<LayerAccessStatFullDetails>,
-    pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
-    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
+    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
+    pub access_time: SystemTime,
+
+    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
+    pub residence_time: SystemTime,
+
+    pub visible: bool,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/libs/pageserver_api/src/models/detach_ancestor.rs
+++ b/libs/pageserver_api/src/models/detach_ancestor.rs
@@ -1,6 +1,6 @@
 use utils::id::TimelineId;

-#[derive(Default, serde::Serialize)]
+#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct AncestorDetached {
    pub reparented_timelines: Vec<TimelineId>,
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -7,6 +7,7 @@ license.workspace = true
 [dependencies]
 anyhow.workspace = true
 async-trait.workspace = true
+async-stream.workspace = true
 once_cell.workspace = true
 aws-smithy-async.workspace = true
 aws-smithy-types.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -15,7 +15,7 @@ use std::time::SystemTime;
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
-use azure_core::RetryOptions;
+use azure_core::{Continuable, RetryOptions};
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::blob::CopyStatus;
@@ -40,6 +40,7 @@ use crate::{

 pub struct AzureBlobStorage {
    client: ContainerClient,
+    container_name: String,
    prefix_in_container: Option<String>,
    max_keys_per_list_response: Option<NonZeroU32>,
    concurrency_limiter: ConcurrencyLimiter,
@@ -85,6 +86,7 @@ impl AzureBlobStorage {

        Ok(AzureBlobStorage {
            client,
+            container_name: azure_config.container_name.to_owned(),
            prefix_in_container: azure_config.prefix_in_container.to_owned(),
            max_keys_per_list_response,
            concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
@@ -238,6 +240,10 @@ impl AzureBlobStorage {
            _ = cancel.cancelled() => Err(Cancelled),
        }
    }
+
+    pub fn container_name(&self) -> &str {
+        &self.container_name
+    }
 }

 fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
@@ -261,30 +267,30 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
 }

 impl RemoteStorage for AzureBlobStorage {
-    async fn list(
+    fn list_streaming(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> anyhow::Result<Listing, DownloadError> {
-        let _permit = self.permit(RequestKind::List, cancel).await?;
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+        // get the passed prefix or if it is not set use prefix_in_bucket value
+        let list_prefix = prefix
+            .map(|p| self.relative_path_to_name(p))
+            .or_else(|| self.prefix_in_container.clone())
+            .map(|mut p| {
+                // required to end with a separator
+                // otherwise request will return only the entry of a prefix
+                if matches!(mode, ListingMode::WithDelimiter)
+                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                {
+                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                }
+                p
+            });

-        let op = async {
-            // get the passed prefix or if it is not set use prefix_in_bucket value
-            let list_prefix = prefix
-                .map(|p| self.relative_path_to_name(p))
-                .or_else(|| self.prefix_in_container.clone())
-                .map(|mut p| {
-                    // required to end with a separator
-                    // otherwise request will return only the entry of a prefix
-                    if matches!(mode, ListingMode::WithDelimiter)
-                        && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                    {
-                        p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                    }
-                    p
-                });
+        async_stream::stream! {
+            let _permit = self.permit(RequestKind::List, cancel).await?;

            let mut builder = self.client.list_blobs();

@@ -300,21 +306,43 @@ impl RemoteStorage for AzureBlobStorage {
                builder = builder.max_results(MaxResults::new(limit));
            }

-            let response = builder.into_stream();
-            let response = response.into_stream().map_err(to_download_error);
-            let response = tokio_stream::StreamExt::timeout(response, self.timeout);
-            let response = response.map(|res| match res {
-                Ok(res) => res,
-                Err(_elapsed) => Err(DownloadError::Timeout),
-            });
+            let mut next_marker = None;

-            let mut response = std::pin::pin!(response);
+            'outer: loop {
+                let mut builder = builder.clone();
+                if let Some(marker) = next_marker.clone() {
+                    builder = builder.marker(marker);
+                }
+                let response = builder.into_stream();
+                let response = response.into_stream().map_err(to_download_error);
+                let response = tokio_stream::StreamExt::timeout(response, self.timeout);
+                let response = response.map(|res| match res {
+                    Ok(res) => res,
+                    Err(_elapsed) => Err(DownloadError::Timeout),
+                });

-            let mut res = Listing::default();
+                let mut response = std::pin::pin!(response);

-            let mut max_keys = max_keys.map(|mk| mk.get());
-            while let Some(entry) = response.next().await {
-                let entry = entry?;
+                let mut max_keys = max_keys.map(|mk| mk.get());
+                let next_item = tokio::select! {
+                    op = response.next() => Ok(op),
+                    _ = cancel.cancelled() => Err(DownloadError::Cancelled),
+                }?;
+                let Some(entry) = next_item else {
+                    // The list is complete, so yield it.
+                    break;
+                };
+
+                let mut res = Listing::default();
+                let entry = match entry {
+                    Ok(entry) => entry,
+                    Err(e) => {
+                        // The error is potentially retryable, so we must rewind the loop after yielding.
+                        yield Err(e);
+                        continue;
+                    }
+                };
+                next_marker = entry.continuation();
                let prefix_iter = entry
                    .blobs
                    .prefixes()
@@ -333,19 +361,19 @@ impl RemoteStorage for AzureBlobStorage {
                        assert!(mk > 0);
                        mk -= 1;
                        if mk == 0 {
-                            return Ok(res); // limit reached
+                            yield Ok(res); // limit reached
+                            break 'outer;
                        }
                        max_keys = Some(mk);
                    }
                }
+                yield Ok(res);
+
+                // We are done here
+                if next_marker.is_none() {
+                    break;
+                }
            }
-
-            Ok(res)
-        };
-
-        tokio::select! {
-            res = op => res,
-            _ = cancel.cancelled() => Err(DownloadError::Cancelled),
        }
    }

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -26,7 +26,7 @@ use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};

 use bytes::Bytes;
-use futures::stream::Stream;
+use futures::{stream::Stream, StreamExt};
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -160,13 +160,18 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
-    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
+    /// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`].
+    ///
+    /// The stream is guaranteed to return at least one element, even in the case of errors
+    /// (in that case it's an `Err()`), or an empty `Listing`.
+    ///
+    /// The stream is not ending if it returns an error, as long as [`is_permanent`] returns false on the error.
+    /// The `next` function can be retried, and maybe in a future retry, there will be success.
    ///
    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
    /// from the absolute root of the bucket.
    ///
-    /// `mode` configures whether to use a delimiter.  Without a delimiter all keys
+    /// `mode` configures whether to use a delimiter.  Without a delimiter, all keys
    /// within the prefix are listed in the `keys` of the result.  With a delimiter, any "directories" at the top level of
    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
    /// returned in `keys` ().
@@ -175,13 +180,32 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// will iteratively call listobjects until it runs out of keys.  Note that this is not safe to use on
    /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
    ///
+    /// [`ListObjectsV2`]: <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>
+    /// [`is_permanent`]: DownloadError::is_permanent
+    fn list_streaming(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> impl Stream<Item = Result<Listing, DownloadError>>;
+
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
-        _mode: ListingMode,
+        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> Result<Listing, DownloadError>;
+    ) -> Result<Listing, DownloadError> {
+        let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel));
+        let mut combined = stream.next().await.expect("At least one item required")?;
+        while let Some(list) = stream.next().await {
+            let list = list?;
+            combined.keys.extend_from_slice(&list.keys);
+            combined.prefixes.extend_from_slice(&list.prefixes);
+        }
+        Ok(combined)
+    }

    /// Streams the local file contents into remote into the remote storage entry.
    ///
@@ -288,8 +312,8 @@ impl Debug for Download {

 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
-#[derive(Clone)]
 // Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
+#[derive(Clone)]
 pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
@@ -298,13 +322,14 @@ pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
 }

 impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
+    // See [`RemoteStorage::list`].
    pub async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> anyhow::Result<Listing, DownloadError> {
+    ) -> Result<Listing, DownloadError> {
        match self {
            Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
            Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
@@ -313,6 +338,23 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

+    // See [`RemoteStorage::list_streaming`].
+    pub fn list_streaming<'a>(
+        &'a self,
+        prefix: Option<&'a RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &'a CancellationToken,
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
+        match self {
+            Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
+                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
+            Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
+            Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
+            Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
+        }
+    }
+
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -443,7 +485,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
 }

 impl GenericRemoteStorage {
-    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
+    pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        let timeout = storage_config.timeout;
        Ok(match &storage_config.storage {
            RemoteStorageKind::LocalFs { local_path: path } => {
@@ -458,7 +500,7 @@ impl GenericRemoteStorage {
                    std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
+                Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?))
            }
            RemoteStorageKind::AzureContainer(azure_config) => {
                let storage_account = azure_config
@@ -504,6 +546,16 @@ impl GenericRemoteStorage {
            None => self.download(from, cancel).await,
        }
    }
+
+    /// The name of the bucket/container/etc.
+    pub fn bucket_name(&self) -> Option<&str> {
+        match self {
+            Self::LocalFs(_s) => None,
+            Self::AwsS3(s) => Some(s.bucket_name()),
+            Self::AzureBlob(s) => Some(s.container_name()),
+            Self::Unreliable(_s) => None,
+        }
+    }
 }

 /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -331,6 +331,17 @@ impl LocalFs {
 }

 impl RemoteStorage for LocalFs {
+    fn list_streaming(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+        let listing = self.list(prefix, mode, max_keys, cancel);
+        futures::stream::once(listing)
+    }
+
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -16,16 +16,10 @@ use std::{

 use anyhow::{anyhow, Context as _};
 use aws_config::{
-    environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider,
-    meta::credentials::CredentialsProviderChain,
-    profile::ProfileFileCredentialsProvider,
-    provider_config::ProviderConfig,
+    default_provider::credentials::DefaultCredentialsChain,
    retry::{RetryConfigBuilder, RetryMode},
-    web_identity_token::WebIdentityTokenCredentialsProvider,
    BehaviorVersion,
 };
-use aws_credential_types::provider::SharedCredentialsProvider;
 use aws_sdk_s3::{
    config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
@@ -76,40 +70,27 @@ struct GetObjectRequest {
 }
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
+    pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
        tracing::debug!(
            "Creating s3 remote storage for S3 bucket {}",
            remote_storage_config.bucket_name
        );

-        let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
+        let region = Region::new(remote_storage_config.bucket_region.clone());
+        let region_opt = Some(region.clone());

-        let provider_conf = ProviderConfig::without_region().with_region(region.clone());
-
-        let credentials_provider = {
-            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
-            CredentialsProviderChain::first_try(
-                "env",
-                EnvironmentVariableCredentialsProvider::new(),
-            )
-            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
-            .or_else(
-                "profile-sso",
-                ProfileFileCredentialsProvider::builder()
-                    .configure(&provider_conf)
-                    .build(),
-            )
-            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
-            // needed to access remote extensions bucket
-            .or_else(
-                "token",
-                WebIdentityTokenCredentialsProvider::builder()
-                    .configure(&provider_conf)
-                    .build(),
-            )
-            // uses imds v2
-            .or_else("imds", ImdsCredentialsProvider::builder().build())
-        };
+        // https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html
+        // https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html
+        // Incomplete list of auth methods used by this:
+        // * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
+        // * "AWS_PROFILE" / `aws sso login --profile <profile>`
+        // * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
+        // * http (ECS/EKS) container credentials
+        // * imds v2
+        let credentials_provider = DefaultCredentialsChain::builder()
+            .region(region)
+            .build()
+            .await;

        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
@@ -118,9 +99,9 @@ impl S3Bucket {
            #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
            BehaviorVersion::v2023_11_09(),
        )
-        .region(region)
+        .region(region_opt)
        .identity_cache(IdentityCache::lazy().build())
-        .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
+        .credentials_provider(credentials_provider)
        .sleep_impl(SharedAsyncSleep::from(sleep_impl));

        let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
@@ -405,6 +386,10 @@ impl S3Bucket {
        }
        Ok(())
    }
+
+    pub fn bucket_name(&self) -> &str {
+        &self.bucket_name
+    }
 }

 pin_project_lite::pin_project! {
@@ -482,17 +467,16 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
 }

 impl RemoteStorage for S3Bucket {
-    async fn list(
+    fn list_streaming(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> Result<Listing, DownloadError> {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
        let kind = RequestKind::List;
        // s3 sdk wants i32
        let mut max_keys = max_keys.map(|mk| mk.get() as i32);
-        let mut result = Listing::default();

        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
@@ -504,89 +488,99 @@ impl RemoteStorage for S3Bucket {
                })
            });

-        let _permit = self.permit(kind, cancel).await?;
+        async_stream::stream! {
+            let _permit = self.permit(kind, cancel).await?;

-        let mut continuation_token = None;
+            let mut continuation_token = None;
+            'outer: loop {
+                let started_at = start_measuring_requests(kind);

-        loop {
-            let started_at = start_measuring_requests(kind);
+                // min of two Options, returning Some if one is value and another is
+                // None (None is smaller than anything, so plain min doesn't work).
+                let request_max_keys = self
+                    .max_keys_per_list_response
+                    .into_iter()
+                    .chain(max_keys.into_iter())
+                    .min();
+                let mut request = self
+                    .client
+                    .list_objects_v2()
+                    .bucket(self.bucket_name.clone())
+                    .set_prefix(list_prefix.clone())
+                    .set_continuation_token(continuation_token.clone())
+                    .set_max_keys(request_max_keys);

-            // min of two Options, returning Some if one is value and another is
-            // None (None is smaller than anything, so plain min doesn't work).
-            let request_max_keys = self
-                .max_keys_per_list_response
-                .into_iter()
-                .chain(max_keys.into_iter())
-                .min();
-            let mut request = self
-                .client
-                .list_objects_v2()
-                .bucket(self.bucket_name.clone())
-                .set_prefix(list_prefix.clone())
-                .set_continuation_token(continuation_token)
-                .set_max_keys(request_max_keys);
-
-            if let ListingMode::WithDelimiter = mode {
-                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-            }
-
-            let request = request.send();
-
-            let response = tokio::select! {
-                res = request => res,
-                _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
-                _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
-            };
-
-            let response = response
-                .context("Failed to list S3 prefixes")
-                .map_err(DownloadError::Other);
-
-            let started_at = ScopeGuard::into_inner(started_at);
-
-            crate::metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &response, started_at);
-
-            let response = response?;
-
-            let keys = response.contents();
-            let empty = Vec::new();
-            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
-
-            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
-
-            for object in keys {
-                let object_path = object.key().expect("response does not contain a key");
-                let remote_path = self.s3_object_to_relative_path(object_path);
-                result.keys.push(remote_path);
-                if let Some(mut mk) = max_keys {
-                    assert!(mk > 0);
-                    mk -= 1;
-                    if mk == 0 {
-                        return Ok(result); // limit reached
-                    }
-                    max_keys = Some(mk);
+                if let ListingMode::WithDelimiter = mode {
+                    request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
                }
+
+                let request = request.send();
+
+                let response = tokio::select! {
+                    res = request => Ok(res),
+                    _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout),
+                    _ = cancel.cancelled() => Err(DownloadError::Cancelled),
+                }?;
+
+                let response = response
+                    .context("Failed to list S3 prefixes")
+                    .map_err(DownloadError::Other);
+
+                let started_at = ScopeGuard::into_inner(started_at);
+
+                crate::metrics::BUCKET_METRICS
+                    .req_seconds
+                    .observe_elapsed(kind, &response, started_at);
+
+                let response = match response {
+                    Ok(response) => response,
+                    Err(e) => {
+                        // The error is potentially retryable, so we must rewind the loop after yielding.
+                        yield Err(e);
+                        continue 'outer;
+                    },
+                };
+
+                let keys = response.contents();
+                let prefixes = response.common_prefixes.as_deref().unwrap_or_default();
+
+                tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+                let mut result = Listing::default();
+
+                for object in keys {
+                    let object_path = object.key().expect("response does not contain a key");
+                    let remote_path = self.s3_object_to_relative_path(object_path);
+                    result.keys.push(remote_path);
+                    if let Some(mut mk) = max_keys {
+                        assert!(mk > 0);
+                        mk -= 1;
+                        if mk == 0 {
+                            // limit reached
+                            yield Ok(result);
+                            break 'outer;
+                        }
+                        max_keys = Some(mk);
+                    }
+                }
+
+                // S3 gives us prefixes like "foo/", we return them like "foo"
+                result.prefixes.extend(prefixes.iter().filter_map(|o| {
+                    Some(
+                        self.s3_object_to_relative_path(
+                            o.prefix()?
+                                .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
+                        ),
+                    )
+                }));
+
+                yield Ok(result);
+
+                continuation_token = match response.next_continuation_token {
+                    Some(new_token) => Some(new_token),
+                    None => break,
+                };
            }
-
-            // S3 gives us prefixes like "foo/", we return them like "foo"
-            result.prefixes.extend(prefixes.iter().filter_map(|o| {
-                Some(
-                    self.s3_object_to_relative_path(
-                        o.prefix()?
-                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
-                    ),
-                )
-            }));
-
-            continuation_token = match response.next_continuation_token {
-                Some(new_token) => Some(new_token),
-                None => break,
-            };
        }
-
-        Ok(result)
    }

    async fn upload(
@@ -1041,8 +1035,8 @@ mod tests {

    use crate::{RemotePath, S3Bucket, S3Config};

-    #[test]
-    fn relative_path() {
+    #[tokio::test]
+    async fn relative_path() {
        let all_paths = ["", "some/path", "some/path/"];
        let all_paths: Vec<RemotePath> = all_paths
            .iter()
@@ -1085,8 +1079,9 @@ mod tests {
                max_keys_per_list_response: Some(5),
                upload_storage_class: None,
            };
-            let storage =
-                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
+            let storage = S3Bucket::new(&config, std::time::Duration::ZERO)
+                .await
+                .expect("remote storage init");
            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
                let result = storage.relative_path_to_s3_object(test_path);
                let expected = expected_outputs[prefix_idx][test_path_idx];
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -3,6 +3,7 @@
 //! testing purposes.
 use bytes::Bytes;
 use futures::stream::Stream;
+use futures::StreamExt;
 use std::collections::HashMap;
 use std::num::NonZeroU32;
 use std::sync::Mutex;
@@ -107,6 +108,23 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;

 impl RemoteStorage for UnreliableWrapper {
+    fn list_streaming(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+        async_stream::stream! {
+            self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
+                .map_err(DownloadError::Other)?;
+            let mut stream = self.inner
+                .list_streaming(prefix, mode, max_keys, cancel);
+            while let Some(item) = stream.next().await {
+                yield item;
+            }
+        }
+    }
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/tests/common/mod.rs
+++ b/libs/remote_storage/tests/common/mod.rs
@@ -152,7 +152,7 @@ pub(crate) async fn upload_remote_data(
    let mut upload_tasks = JoinSet::new();
    let cancel = CancellationToken::new();

-    for i in 1..upload_tasks_count + 1 {
+    for i in 1..=upload_tasks_count {
        let task_client = Arc::clone(client);
        let cancel = cancel.clone();

--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,5 +1,6 @@
 use anyhow::Context;
 use camino::Utf8Path;
+use futures::StreamExt;
 use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
@@ -29,10 +30,10 @@ use super::{
 /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
 /// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
 ///
-/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
-/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
-/// since current default AWS S3 pagination limit is 1000.
-/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
+/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_in_list_response` param to limit the keys in a single response.
+/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
+/// as the current default AWS S3 pagination limit is 1000.
+/// (see <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>).
 ///
 /// Lastly, the test attempts to clean up and remove all uploaded S3 files.
 /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
@@ -87,6 +88,41 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
    );

+    // list_streaming
+
+    let prefix_with_slash = base_prefix.add_trailing_slash();
+    let mut nested_remote_prefixes_st = test_client.list_streaming(
+        Some(&prefix_with_slash),
+        ListingMode::WithDelimiter,
+        None,
+        &cancel,
+    );
+    let mut nested_remote_prefixes_combined = HashSet::new();
+    let mut segments = 0;
+    let mut segment_max_size = 0;
+    while let Some(st) = nested_remote_prefixes_st.next().await {
+        let st = st?;
+        segment_max_size = segment_max_size.max(st.prefixes.len());
+        nested_remote_prefixes_combined.extend(st.prefixes.into_iter());
+        segments += 1;
+    }
+    assert!(segments > 1, "less than 2 segments: {segments}");
+    assert!(
+        segment_max_size * 2 <= nested_remote_prefixes_combined.len(),
+        "double of segment_max_size={segment_max_size} larger number of remote prefixes of {}",
+        nested_remote_prefixes_combined.len()
+    );
+    let remote_only_prefixes = nested_remote_prefixes_combined
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes_combined)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );
+
    Ok(())
 }

--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -31,6 +31,7 @@ struct EnabledAzure {
 impl EnabledAzure {
    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
        let client = create_azure_client(max_keys_in_list_response)
+            .await
            .context("Azure client creation")
            .expect("Azure client creation failed");

@@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    }
 }

-fn create_azure_client(
+async fn create_azure_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    use rand::Rng;
@@ -221,6 +222,8 @@ fn create_azure_client(
        timeout: Duration::from_secs(120),
    };
    Ok(Arc::new(
-        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
+        GenericRemoteStorage::from_config(&remote_storage_config)
+            .await
+            .context("remote storage init")?,
    ))
 }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -197,6 +197,7 @@ struct EnabledS3 {
 impl EnabledS3 {
    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
        let client = create_s3_client(max_keys_in_list_response)
+            .await
            .context("S3 client creation")
            .expect("S3 client creation failed");

@@ -352,7 +353,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    }
 }

-fn create_s3_client(
+async fn create_s3_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    use rand::Rng;
@@ -385,7 +386,9 @@ fn create_s3_client(
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
    Ok(Arc::new(
-        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
+        GenericRemoteStorage::from_config(&remote_storage_config)
+            .await
+            .context("remote storage init")?,
    ))
 }

--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -20,7 +20,6 @@ bincode.workspace = true
 bytes.workspace = true
 camino.workspace = true
 chrono.workspace = true
-heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 humantime.workspace = true
 hyper = { workspace = true, features = ["full"] }
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -33,6 +33,10 @@ pub enum Scope {
    GenerationsApi,
    // Allows access to control plane managment API and some storage controller endpoints.
    Admin,
+
+    /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
+    /// of a tenant & post scrub results.
+    Scrubber,
 }

 /// JWT payload. See docs/authentication.md for the format
--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -0,0 +1,114 @@
+use std::{
+    fmt::Display,
+    time::{Duration, Instant},
+};
+
+use metrics::IntCounter;
+
+/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
+/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
+/// to mitigate the log spam from repeated failures.
+pub struct CircuitBreaker {
+    /// An identifier that enables us to log useful errors when a circuit is broken
+    name: String,
+
+    /// Consecutive failures since last success
+    fail_count: usize,
+
+    /// How many consecutive failures before we break the circuit
+    fail_threshold: usize,
+
+    /// If circuit is broken, when was it broken?
+    broken_at: Option<Instant>,
+
+    /// If set, we will auto-reset the circuit this long after it was broken.  If None, broken
+    /// circuits stay broken forever, or until success() is called.
+    reset_period: Option<Duration>,
+
+    /// If this is true, no actual circuit-breaking happens.  This is for overriding a circuit breaker
+    /// to permit something to keep running even if it would otherwise have tripped it.
+    short_circuit: bool,
+}
+
+impl CircuitBreaker {
+    pub fn new(name: String, fail_threshold: usize, reset_period: Option<Duration>) -> Self {
+        Self {
+            name,
+            fail_count: 0,
+            fail_threshold,
+            broken_at: None,
+            reset_period,
+            short_circuit: false,
+        }
+    }
+
+    /// Construct an unbreakable circuit breaker, for use in unit tests etc.
+    pub fn short_circuit() -> Self {
+        Self {
+            name: String::new(),
+            fail_threshold: 0,
+            fail_count: 0,
+            broken_at: None,
+            reset_period: None,
+            short_circuit: true,
+        }
+    }
+
+    pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
+    where
+        E: Display,
+    {
+        if self.short_circuit {
+            return;
+        }
+
+        self.fail_count += 1;
+        if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
+            self.break_circuit(metric, error);
+        }
+    }
+
+    /// Call this after successfully executing an operation
+    pub fn success(&mut self, metric: &IntCounter) {
+        self.fail_count = 0;
+        if let Some(broken_at) = &self.broken_at {
+            tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
+                humantime::format_duration(broken_at.elapsed()));
+            self.broken_at = None;
+            metric.inc();
+        }
+    }
+
+    /// Call this before attempting an operation, and skip the operation if we are currently broken.
+    pub fn is_broken(&mut self) -> bool {
+        if self.short_circuit {
+            return false;
+        }
+
+        if let Some(broken_at) = self.broken_at {
+            match self.reset_period {
+                Some(reset_period) if broken_at.elapsed() > reset_period => {
+                    self.reset_circuit();
+                    false
+                }
+                _ => true,
+            }
+        } else {
+            false
+        }
+    }
+
+    fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
+    where
+        E: Display,
+    {
+        self.broken_at = Some(Instant::now());
+        tracing::error!(breaker=%self.name, "Circuit breaker broken!  Last error: {error}");
+        metric.inc();
+    }
+
+    fn reset_circuit(&mut self) {
+        self.broken_at = None;
+        self.fail_count = 0;
+    }
+}
--- a/libs/utils/src/history_buffer.rs
+++ b/libs/utils/src/history_buffer.rs
@@ -1,196 +0,0 @@
-//! A heapless buffer for events of sorts.
-
-use std::ops;
-
-use heapless::HistoryBuffer;
-
-#[derive(Debug, Clone)]
-pub struct HistoryBufferWithDropCounter<T, const L: usize> {
-    buffer: HistoryBuffer<T, L>,
-    drop_count: u64,
-}
-
-impl<T, const L: usize> HistoryBufferWithDropCounter<T, L> {
-    pub fn write(&mut self, data: T) {
-        let len_before = self.buffer.len();
-        self.buffer.write(data);
-        let len_after = self.buffer.len();
-        self.drop_count += u64::from(len_before == len_after);
-    }
-    pub fn drop_count(&self) -> u64 {
-        self.drop_count
-    }
-    pub fn map<U, F: Fn(&T) -> U>(&self, f: F) -> HistoryBufferWithDropCounter<U, L> {
-        let mut buffer = HistoryBuffer::new();
-        buffer.extend(self.buffer.oldest_ordered().map(f));
-        HistoryBufferWithDropCounter::<U, L> {
-            buffer,
-            drop_count: self.drop_count,
-        }
-    }
-}
-
-impl<T, const L: usize> Default for HistoryBufferWithDropCounter<T, L> {
-    fn default() -> Self {
-        Self {
-            buffer: HistoryBuffer::default(),
-            drop_count: 0,
-        }
-    }
-}
-
-impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
-    type Target = HistoryBuffer<T, L>;
-
-    fn deref(&self) -> &Self::Target {
-        &self.buffer
-    }
-}
-
-#[derive(serde::Serialize, serde::Deserialize)]
-struct SerdeRepr<T> {
-    buffer: Vec<T>,
-    buffer_size: usize,
-    drop_count: u64,
-}
-
-impl<'a, T, const L: usize> From<&'a HistoryBufferWithDropCounter<T, L>> for SerdeRepr<T>
-where
-    T: Clone + serde::Serialize,
-{
-    fn from(value: &'a HistoryBufferWithDropCounter<T, L>) -> Self {
-        let HistoryBufferWithDropCounter { buffer, drop_count } = value;
-        SerdeRepr {
-            buffer: buffer.iter().cloned().collect(),
-            buffer_size: L,
-            drop_count: *drop_count,
-        }
-    }
-}
-
-impl<T, const L: usize> serde::Serialize for HistoryBufferWithDropCounter<T, L>
-where
-    T: Clone + serde::Serialize,
-{
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        SerdeRepr::from(self).serialize(serializer)
-    }
-}
-
-impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
-where
-    T: Clone + serde::Deserialize<'de>,
-{
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let SerdeRepr {
-            buffer: des_buffer,
-            drop_count,
-            buffer_size,
-        } = SerdeRepr::<T>::deserialize(deserializer)?;
-        if buffer_size != L {
-            use serde::de::Error;
-            return Err(D::Error::custom(format!(
-                "invalid buffer_size, expecting {L} got {buffer_size}"
-            )));
-        }
-        let mut buffer = HistoryBuffer::new();
-        buffer.extend(des_buffer);
-        Ok(HistoryBufferWithDropCounter { buffer, drop_count })
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::HistoryBufferWithDropCounter;
-
-    #[test]
-    fn test_basics() {
-        let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
-        b.write(1);
-        b.write(2);
-        b.write(3);
-        assert!(b.iter().any(|e| *e == 2));
-        assert!(b.iter().any(|e| *e == 3));
-        assert!(!b.iter().any(|e| *e == 1));
-
-        // round-trip serde
-        let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
-            serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
-        assert_eq!(
-            round_tripped.iter().cloned().collect::<Vec<_>>(),
-            b.iter().cloned().collect::<Vec<_>>()
-        );
-    }
-
-    #[test]
-    fn test_drop_count_works() {
-        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
-        b.write(1);
-        assert_eq!(b.drop_count(), 0);
-        b.write(2);
-        assert_eq!(b.drop_count(), 0);
-        b.write(3);
-        assert_eq!(b.drop_count(), 1);
-        b.write(4);
-        assert_eq!(b.drop_count(), 2);
-    }
-
-    #[test]
-    fn test_clone_works() {
-        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
-        b.write(1);
-        b.write(2);
-        b.write(3);
-        assert_eq!(b.drop_count(), 1);
-        let mut c = b.clone();
-        assert_eq!(c.drop_count(), 1);
-        assert!(c.iter().any(|e| *e == 2));
-        assert!(c.iter().any(|e| *e == 3));
-        assert!(!c.iter().any(|e| *e == 1));
-
-        c.write(4);
-        assert!(c.iter().any(|e| *e == 4));
-        assert!(!b.iter().any(|e| *e == 4));
-    }
-
-    #[test]
-    fn test_map() {
-        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
-
-        b.write(1);
-        assert_eq!(b.drop_count(), 0);
-        {
-            let c = b.map(|i| i + 10);
-            assert_eq!(c.oldest_ordered().cloned().collect::<Vec<_>>(), vec![11]);
-            assert_eq!(c.drop_count(), 0);
-        }
-
-        b.write(2);
-        assert_eq!(b.drop_count(), 0);
-        {
-            let c = b.map(|i| i + 10);
-            assert_eq!(
-                c.oldest_ordered().cloned().collect::<Vec<_>>(),
-                vec![11, 12]
-            );
-            assert_eq!(c.drop_count(), 0);
-        }
-
-        b.write(3);
-        assert_eq!(b.drop_count(), 1);
-        {
-            let c = b.map(|i| i + 10);
-            assert_eq!(
-                c.oldest_ordered().cloned().collect::<Vec<_>>(),
-                vec![12, 13]
-            );
-            assert_eq!(c.drop_count(), 1);
-        }
-    }
-}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -52,17 +52,17 @@ struct RequestId(String);
 /// There could be other ways to implement similar functionality:
 ///
 /// * procmacros placed on top of all handler methods
-/// With all the drawbacks of procmacros, brings no difference implementation-wise,
-/// and little code reduction compared to the existing approach.
+///   With all the drawbacks of procmacros, brings no difference implementation-wise,
+///   and little code reduction compared to the existing approach.
 ///
 /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
-/// implemented for [`RouterBuilder`].
-/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
+///   implemented for [`RouterBuilder`].
+///   Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
 ///
 /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
-/// later, in a post-response middleware.
-/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
-/// tries to achive with its `.instrument` used in the current approach.
+///   later, in a post-response middleware.
+///   Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
+///   tries to achive with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
 pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -302,17 +302,6 @@ pub struct TenantId(Id);

 id_newtype!(TenantId);

-/// Neon Connection Id identifies long-lived connections (for example a pagestream
-/// connection with the page_service). Is used for better logging and tracing
-///
-/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
-/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
-/// See [`Id`] for alternative ways to serialize it.
-#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
-pub struct ConnectionId(Id);
-
-id_newtype!(ConnectionId);
-
 // A pair uniquely identifying Neon instance.
 #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
 pub struct TenantTimelineId {
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -59,8 +59,6 @@ pub mod signals;

 pub mod fs_ext;

-pub mod history_buffer;
-
 pub mod measured_stream;

 pub mod serde_percent;
@@ -98,6 +96,8 @@ pub mod poison;

 pub mod toml_edit_ext;

+pub mod circuit_breaker;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -49,6 +49,7 @@ pub struct TenantShardId {

 impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);
+    pub const MIN: Self = Self(0);

    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,6 +1,7 @@
 use std::collections::HashMap;

 use bytes::Bytes;
+use detach_ancestor::AncestorDetached;
 use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
@@ -418,6 +419,23 @@ impl Client {
        }
    }

+    pub async fn timeline_detach_ancestor(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<AncestorDetached> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
+            self.mgmt_api_endpoint
+        );
+
+        self.request(Method::PUT, &uri, ())
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -131,7 +131,7 @@ impl CompactionKey for Key {
 pub type CompactionKeySpace<K> = Vec<Range<K>>;

 /// Functions needed from all layers.
-pub trait CompactionLayer<K: CompactionKey + ?Sized> {
+pub trait CompactionLayer<K: CompactionKey> {
    fn key_range(&self) -> &Range<K>;
    fn lsn_range(&self) -> &Range<Lsn>;

--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> {
                .get("remote_storage")
                .expect("need remote_storage");
            let config = RemoteStorageConfig::from_toml(toml_item)?;
-            let storage = remote_storage::GenericRemoteStorage::from_config(&config);
+            let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
            let cancel = CancellationToken::new();
            storage
                .unwrap()
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
-            format!(
-                "JWT scope '{:?}' is ineligible for Pageserver auth",
-                claims.scope
-            )
-            .into(),
-        )),
+        (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => {
+            Err(AuthError(
+                format!(
+                    "JWT scope '{:?}' is ineligible for Pageserver auth",
+                    claims.scope
+                )
+                .into(),
+            ))
+        }
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,30 +2,35 @@

 //! Main entry point for the Page Server executable.

+use std::env;
 use std::env::{var, VarError};
 use std::io::Read;
 use std::sync::Arc;
 use std::time::Duration;
-use std::{env, ops::ControlFlow, str::FromStr};

 use anyhow::{anyhow, Context};
 use camino::Utf8Path;
 use clap::{Arg, ArgAction, Command};

 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
+use pageserver::config::PageserverIdentity;
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
+use pageserver::{
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
+};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::*;

 use metrics::set_build_info_metric;
 use pageserver::{
-    config::{defaults::*, PageServerConf},
+    config::PageServerConf,
    context::{DownloadBehavior, RequestContext},
    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
@@ -84,18 +89,13 @@ fn main() -> anyhow::Result<()> {
        .with_context(|| format!("Error opening workdir '{workdir}'"))?;

    let cfg_file_path = workdir.join("pageserver.toml");
+    let identity_file_path = workdir.join("identity.toml");

    // Set CWD to workdir for non-daemon modes
    env::set_current_dir(&workdir)
        .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;

-    let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
-        ControlFlow::Continue(conf) => conf,
-        ControlFlow::Break(()) => {
-            info!("Pageserver config init successful");
-            return Ok(());
-        }
-    };
+    let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;

    // Initialize logging.
    //
@@ -150,70 +150,55 @@ fn main() -> anyhow::Result<()> {
 }

 fn initialize_config(
+    identity_file_path: &Utf8Path,
    cfg_file_path: &Utf8Path,
-    arg_matches: clap::ArgMatches,
    workdir: &Utf8Path,
-) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
-    let init = arg_matches.get_flag("init");
-
-    let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
+) -> anyhow::Result<&'static PageServerConf> {
+    // The deployment orchestrator writes out an indentity file containing the node id
+    // for all pageservers. This file is the source of truth for the node id. In order
+    // to allow for rolling back pageserver releases, the node id is also included in
+    // the pageserver config that the deployment orchestrator writes to disk for the pageserver.
+    // A rolled back version of the pageserver will get the node id from the pageserver.toml
+    // config file.
+    let identity = match std::fs::File::open(identity_file_path) {
        Ok(mut f) => {
-            if init {
-                anyhow::bail!("config file already exists: {cfg_file_path}");
+            let md = f.metadata().context("stat config file")?;
+            if !md.is_file() {
+                anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ...");
            }
+
+            let mut s = String::new();
+            f.read_to_string(&mut s).context("read identity file")?;
+            toml_edit::de::from_str::<PageserverIdentity>(&s)?
+        }
+        Err(e) => {
+            anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ...");
+        }
+    };
+
+    let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
+        Ok(mut f) => {
            let md = f.metadata().context("stat config file")?;
            if md.is_file() {
                let mut s = String::new();
                f.read_to_string(&mut s).context("read config file")?;
-                Some(s.parse().context("parse config file toml")?)
+                s.parse().context("parse config file toml")?
            } else {
                anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
            }
        }
-        Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
        Err(e) => {
            anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
        }
    };

-    let mut effective_config = file_contents.unwrap_or_else(|| {
-        DEFAULT_CONFIG_FILE
-            .parse()
-            .expect("unit tests ensure this works")
-    });
-
-    // Patch with overrides from the command line
-    if let Some(values) = arg_matches.get_many::<String>("config-override") {
-        for option_line in values {
-            let doc = toml_edit::Document::from_str(option_line).with_context(|| {
-                format!("Option '{option_line}' could not be parsed as a toml document")
-            })?;
-
-            for (key, item) in doc.iter() {
-                effective_config.insert(key, item.clone());
-            }
-        }
-    }
-
-    debug!("Resulting toml: {effective_config}");
+    debug!("Using pageserver toml: {config}");

    // Construct the runtime representation
-    let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
+    let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
        .context("Failed to parse pageserver configuration")?;

-    if init {
-        info!("Writing pageserver config to '{cfg_file_path}'");
-
-        std::fs::write(cfg_file_path, effective_config.to_string())
-            .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
-        info!("Config successfully written to '{cfg_file_path}'")
-    }
-
-    Ok(if init {
-        ControlFlow::Break(())
-    } else {
-        ControlFlow::Continue(Box::leak(Box::new(conf)))
-    })
+    Ok(Box::leak(Box::new(conf)))
 }

 struct WaitForPhaseResult<F: std::future::Future + Unpin> {
@@ -305,6 +290,7 @@ fn start_pageserver(
    // Create and lock PID file. This ensures that there cannot be more than one
    // pageserver process running at the same time.
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
+    info!("Claiming pid file at {lock_file_path:?}...");
    let lock_file =
        utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
    info!("Claimed pid file at {lock_file_path:?}");
@@ -385,7 +371,7 @@ fn start_pageserver(
    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();

    // Set up remote storage client
-    let remote_storage = create_remote_storage_client(conf)?;
+    let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?;

    // Set up deletion queue
    let (deletion_queue, deletion_workers) = DeletionQueue::new(
@@ -430,8 +416,10 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
+    let background_purges = mgr::BackgroundPurges::default();
    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
+        background_purges.clone(),
        TenantSharedResources {
            broker_client: broker_client.clone(),
            remote_storage: remote_storage.clone(),
@@ -523,7 +511,7 @@ fn start_pageserver(
        }
    });

-    let secondary_controller = secondary::spawn_tasks(
+    let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks(
        tenant_manager.clone(),
        remote_storage.clone(),
        background_jobs_barrier.clone(),
@@ -536,18 +524,19 @@ fn start_pageserver(
    // been configured.
    let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();

-    launch_disk_usage_global_eviction_task(
+    let disk_usage_eviction_task = launch_disk_usage_global_eviction_task(
        conf,
        remote_storage.clone(),
        disk_usage_eviction_state.clone(),
        tenant_manager.clone(),
        background_jobs_barrier.clone(),
-    )?;
+    );

    // Start up the service to handle HTTP mgmt API request. We created the
    // listener earlier already.
-    {
-        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
+    let http_endpoint_listener = {
+        let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
+        let cancel = CancellationToken::new();

        let router_state = Arc::new(
            http::routes::State::new(
@@ -568,78 +557,44 @@ fn start_pageserver(
        let service = utils::http::RouterService::new(router).unwrap();
        let server = hyper::Server::from_tcp(http_listener)?
            .serve(service)
-            .with_graceful_shutdown(task_mgr::shutdown_watcher());
+            .with_graceful_shutdown({
+                let cancel = cancel.clone();
+                async move { cancel.clone().cancelled().await }
+            });

-        task_mgr::spawn(
-            MGMT_REQUEST_RUNTIME.handle(),
-            TaskKind::HttpEndpointListener,
-            None,
-            None,
+        let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
            "http endpoint listener",
-            true,
-            async {
-                server.await?;
-                Ok(())
-            },
-        );
-    }
+            server,
+        ));
+        HttpEndpointListener(CancellableTask { task, cancel })
+    };

-    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-        let metrics_ctx = RequestContext::todo_child(
-            TaskKind::MetricsCollection,
-            // This task itself shouldn't download anything.
-            // The actual size calculation does need downloads, and
-            // creates a child context with the right DownloadBehavior.
-            DownloadBehavior::Error,
-        );
+    let consumption_metrics_tasks = {
+        let cancel = shutdown_pageserver.child_token();
+        let task = crate::BACKGROUND_RUNTIME.spawn({
+            let tenant_manager = tenant_manager.clone();
+            let cancel = cancel.clone();
+            async move {
+                // first wait until background jobs are cleared to launch.
+                //
+                // this is because we only process active tenants and timelines, and the
+                // Timeline::get_current_logical_size will spawn the logical size calculation,
+                // which will not be rate-limited.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return; },
+                    _ = background_jobs_barrier.wait() => {}
+                };

-        let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
-
-        task_mgr::spawn(
-            crate::BACKGROUND_RUNTIME.handle(),
-            TaskKind::MetricsCollection,
-            None,
-            None,
-            "consumption metrics collection",
-            true,
-            {
-                let tenant_manager = tenant_manager.clone();
-                async move {
-                    // first wait until background jobs are cleared to launch.
-                    //
-                    // this is because we only process active tenants and timelines, and the
-                    // Timeline::get_current_logical_size will spawn the logical size calculation,
-                    // which will not be rate-limited.
-                    let cancel = task_mgr::shutdown_token();
-
-                    tokio::select! {
-                        _ = cancel.cancelled() => { return Ok(()); },
-                        _ = background_jobs_barrier.wait() => {}
-                    };
-
-                    pageserver::consumption_metrics::collect_metrics(
-                        tenant_manager,
-                        metric_collection_endpoint,
-                        &conf.metric_collection_bucket,
-                        conf.metric_collection_interval,
-                        conf.cached_metric_collection_interval,
-                        conf.synthetic_size_calculation_interval,
-                        conf.id,
-                        local_disk_storage,
-                        cancel,
-                        metrics_ctx,
-                    )
-                    .instrument(info_span!("metrics_collection"))
-                    .await?;
-                    Ok(())
-                }
-            },
-        );
-    }
+                pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await;
+            }
+        });
+        ConsumptionMetricsTasks(CancellableTask { task, cancel })
+    };

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    {
+    let libpq_listener = {
+        let cancel = CancellationToken::new();
        let libpq_ctx = RequestContext::todo_child(
            TaskKind::LibpqEndpointListener,
            // listener task shouldn't need to download anything. (We will
@@ -648,29 +603,20 @@ fn start_pageserver(
            // accept connections.)
            DownloadBehavior::Error,
        );
-        task_mgr::spawn(
-            COMPUTE_REQUEST_RUNTIME.handle(),
-            TaskKind::LibpqEndpointListener,
-            None,
-            None,
-            "libpq endpoint listener",
-            true,
-            {
-                let tenant_manager = tenant_manager.clone();
-                async move {
-                    page_service::libpq_listener_main(
-                        tenant_manager,
-                        pg_auth,
-                        pageserver_listener,
-                        conf.pg_auth_type,
-                        libpq_ctx,
-                        task_mgr::shutdown_token(),
-                    )
-                    .await
-                }
-            },
-        );
-    }
+
+        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+            "libpq listener",
+            page_service::libpq_listener_main(
+                tenant_manager.clone(),
+                pg_auth,
+                pageserver_listener,
+                conf.pg_auth_type,
+                libpq_ctx,
+                cancel.clone(),
+            ),
+        ));
+        LibpqEndpointListener(CancellableTask { task, cancel })
+    };

    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

@@ -696,13 +642,24 @@ fn start_pageserver(
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
-            pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
+            pageserver::shutdown_pageserver(
+                http_endpoint_listener,
+                libpq_listener,
+                consumption_metrics_tasks,
+                disk_usage_eviction_task,
+                &tenant_manager,
+                background_purges,
+                deletion_queue.clone(),
+                secondary_controller_tasks,
+                0,
+            )
+            .await;
            unreachable!()
        })
    }
 }

-fn create_remote_storage_client(
+async fn create_remote_storage_client(
    conf: &'static PageServerConf,
 ) -> anyhow::Result<GenericRemoteStorage> {
    let config = if let Some(config) = &conf.remote_storage_config {
@@ -712,7 +669,7 @@ fn create_remote_storage_client(
    };

    // Create the client
-    let mut remote_storage = GenericRemoteStorage::from_config(config)?;
+    let mut remote_storage = GenericRemoteStorage::from_config(config).await?;

    // If `test_remote_failures` is non-zero, wrap the client with a
    // wrapper that simulates failures.
@@ -735,28 +692,12 @@ fn cli() -> Command {
    Command::new("Neon page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(version())
-        .arg(
-            Arg::new("init")
-                .long("init")
-                .action(ArgAction::SetTrue)
-                .help("Initialize pageserver with all given config overrides"),
-        )
        .arg(
            Arg::new("workdir")
                .short('D')
                .long("workdir")
                .help("Working directory for the pageserver"),
        )
-        // See `settings.md` for more details on the extra configuration patameters pageserver can process
-        .arg(
-            Arg::new("config-override")
-                .long("config-override")
-                .short('c')
-                .num_args(1)
-                .action(ArgAction::Append)
-                .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
-                Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
-        )
        .arg(
            Arg::new("enabled-features")
                .long("enabled-features")
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -7,12 +7,11 @@
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
 use remote_storage::{RemotePath, RemoteStorageConfig};
-use serde;
 use serde::de::IntoDeserializer;
+use serde::{self, Deserialize};
 use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
-use utils::id::ConnectionId;
 use utils::logging::SecretString;

 use once_cell::sync::OnceCell;
@@ -69,7 +68,6 @@ pub mod defaults {
        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();

    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
@@ -124,7 +122,6 @@ pub mod defaults {
 #concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'

 #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
-#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'

 #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
@@ -239,7 +236,6 @@ pub struct PageServerConf {
    // How often to collect metrics and send them to the metrics endpoint.
    pub metric_collection_interval: Duration,
    // How often to send unchanged cached metrics to the metrics endpoint.
-    pub cached_metric_collection_interval: Duration,
    pub metric_collection_endpoint: Option<Url>,
    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    pub synthetic_size_calculation_interval: Duration,
@@ -371,7 +367,6 @@ struct PageServerConfigBuilder {
    concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,

    metric_collection_interval: BuilderValue<Duration>,
-    cached_metric_collection_interval: BuilderValue<Duration>,
    metric_collection_endpoint: BuilderValue<Option<Url>>,
    synthetic_size_calculation_interval: BuilderValue<Duration>,
    metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
@@ -411,6 +406,13 @@ struct PageServerConfigBuilder {
 }

 impl PageServerConfigBuilder {
+    fn new(node_id: NodeId) -> Self {
+        let mut this = Self::default();
+        this.id(node_id);
+
+        this
+    }
+
    #[inline(always)]
    fn default_values() -> Self {
        use self::BuilderValue::*;
@@ -455,10 +457,6 @@ impl PageServerConfigBuilder {
                DEFAULT_METRIC_COLLECTION_INTERVAL,
            )
            .expect("cannot parse default metric collection interval")),
-            cached_metric_collection_interval: Set(humantime::parse_duration(
-                DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL,
-            )
-            .expect("cannot parse default cached_metric_collection_interval")),
            synthetic_size_calculation_interval: Set(humantime::parse_duration(
                DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
            )
@@ -590,14 +588,6 @@ impl PageServerConfigBuilder {
        self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
    }

-    pub fn cached_metric_collection_interval(
-        &mut self,
-        cached_metric_collection_interval: Duration,
-    ) {
-        self.cached_metric_collection_interval =
-            BuilderValue::Set(cached_metric_collection_interval)
-    }
-
    pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
    }
@@ -731,7 +721,6 @@ impl PageServerConfigBuilder {
                broker_keepalive_interval,
                log_format,
                metric_collection_interval,
-                cached_metric_collection_interval,
                metric_collection_endpoint,
                metric_collection_bucket,
                synthetic_size_calculation_interval,
@@ -870,22 +859,6 @@ impl PageServerConf {
        )
    }

-    pub fn traces_path(&self) -> Utf8PathBuf {
-        self.workdir.join("traces")
-    }
-
-    pub fn trace_path(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-        connection_id: &ConnectionId,
-    ) -> Utf8PathBuf {
-        self.traces_path()
-            .join(tenant_shard_id.to_string())
-            .join(timeline_id.to_string())
-            .join(connection_id.to_string())
-    }
-
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
        remote_path.with_base(&self.workdir)
@@ -915,8 +888,12 @@ impl PageServerConf {
    /// validating the input and failing on errors.
    ///
    /// This leaves any options not present in the file in the built-in defaults.
-    pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::default();
+    pub fn parse_and_validate(
+        node_id: NodeId,
+        toml: &Document,
+        workdir: &Utf8Path,
+    ) -> anyhow::Result<Self> {
+        let mut builder = PageServerConfigBuilder::new(node_id);
        builder.workdir(workdir.to_owned());

        let mut t_conf = TenantConfOpt::default();
@@ -947,7 +924,8 @@ impl PageServerConf {
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
-                "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
+                "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
+                            // Logging is not set up yet, so we can't do it.
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                "log_format" => builder.log_format(
@@ -964,7 +942,6 @@ impl PageServerConf {
                    NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
                }),
                "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
-                "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
                "metric_collection_endpoint" => {
                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
                    builder.metric_collection_endpoint(Some(endpoint));
@@ -1097,7 +1074,6 @@ impl PageServerConf {
            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
            ),
            metric_collection_interval: Duration::from_secs(60),
-            cached_metric_collection_interval: Duration::from_secs(60 * 60),
            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
            metric_collection_bucket: None,
            synthetic_size_calculation_interval: Duration::from_secs(60),
@@ -1126,6 +1102,12 @@ impl PageServerConf {
    }
 }

+#[derive(Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct PageserverIdentity {
+    pub id: NodeId,
+}
+
 // Helper functions to parse a toml Item

 fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
@@ -1276,7 +1258,6 @@ initial_superuser_name = 'zzzz'
 id = 10

 metric_collection_interval = '222 s'
-cached_metric_collection_interval = '22200 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'

@@ -1296,7 +1277,7 @@ background_task_maximum_delay = '334 s'
        );
        let toml = config_string.parse()?;

-        let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
+        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));

        assert_eq!(
@@ -1332,9 +1313,6 @@ background_task_maximum_delay = '334 s'
                metric_collection_interval: humantime::parse_duration(
                    defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
                )?,
-                cached_metric_collection_interval: humantime::parse_duration(
-                    defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
-                )?,
                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
                metric_collection_bucket: None,
                synthetic_size_calculation_interval: humantime::parse_duration(
@@ -1381,7 +1359,7 @@ background_task_maximum_delay = '334 s'
        );
        let toml = config_string.parse()?;

-        let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
+        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));

        assert_eq!(
@@ -1413,7 +1391,6 @@ background_task_maximum_delay = '334 s'
                eviction_task_immitated_concurrent_logical_size_queries:
                    ConfigurableSemaphore::default(),
                metric_collection_interval: Duration::from_secs(222),
-                cached_metric_collection_interval: Duration::from_secs(22200),
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
                metric_collection_bucket: None,
                synthetic_size_calculation_interval: Duration::from_secs(333),
@@ -1472,12 +1449,13 @@ broker_endpoint = '{broker_endpoint}'

            let toml = config_string.parse()?;

-            let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
-                .unwrap_or_else(|e| {
-                    panic!("Failed to parse config '{config_string}', reason: {e:?}")
-                })
-                .remote_storage_config
-                .expect("Should have remote storage config for the local FS");
+            let parsed_remote_storage_config =
+                PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
+                    .unwrap_or_else(|e| {
+                        panic!("Failed to parse config '{config_string}', reason: {e:?}")
+                    })
+                    .remote_storage_config
+                    .expect("Should have remote storage config for the local FS");

            assert_eq!(
                parsed_remote_storage_config,
@@ -1533,12 +1511,13 @@ broker_endpoint = '{broker_endpoint}'

            let toml = config_string.parse()?;

-            let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
-                .unwrap_or_else(|e| {
-                    panic!("Failed to parse config '{config_string}', reason: {e:?}")
-                })
-                .remote_storage_config
-                .expect("Should have remote storage config for S3");
+            let parsed_remote_storage_config =
+                PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
+                    .unwrap_or_else(|e| {
+                        panic!("Failed to parse config '{config_string}', reason: {e:?}")
+                    })
+                    .remote_storage_config
+                    .expect("Should have remote storage config for S3");

            assert_eq!(
                parsed_remote_storage_config,
@@ -1560,34 +1539,6 @@ broker_endpoint = '{broker_endpoint}'
        Ok(())
    }

-    #[test]
-    fn parse_tenant_config() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-
-        let broker_endpoint = "http://127.0.0.1:7777";
-        let trace_read_requests = true;
-
-        let config_string = format!(
-            r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{pg_distrib_dir}'
-broker_endpoint = '{broker_endpoint}'
-
-[tenant_config]
-trace_read_requests = {trace_read_requests}"#,
-        );
-
-        let toml = config_string.parse()?;
-
-        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
-        assert_eq!(
-            conf.default_tenant_conf.trace_read_requests, trace_read_requests,
-            "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
-        );
-
-        Ok(())
-    }
-
    #[test]
    fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
        let config_string = r#"
@@ -1645,7 +1596,7 @@ threshold = "20m"
 "#,
        );
        let toml: Document = pageserver_conf_toml.parse()?;
-        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
+        let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?;

        assert_eq!(conf.pg_distrib_dir, pg_distrib_dir);
        assert_eq!(
@@ -1661,7 +1612,11 @@ threshold = "20m"
                .evictions_low_residence_duration_metric_threshold,
            Duration::from_secs(20 * 60)
        );
-        assert_eq!(conf.id, NodeId(222));
+
+        // Assert that the node id provided by the indentity file (threaded
+        // through the call to [`PageServerConf::parse_and_validate`] is
+        // used.
+        assert_eq!(conf.id, NodeId(333));
        assert_eq!(
            conf.disk_usage_based_eviction,
            Some(DiskUsageEvictionTaskConfig {
@@ -1670,7 +1625,7 @@ threshold = "20m"
                period: Duration::from_secs(10),
                #[cfg(feature = "testing")]
                mock_statvfs: None,
-                eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
+                eviction_order: Default::default(),
            })
        );

@@ -1706,7 +1661,7 @@ threshold = "20m"
 "#,
        );
        let toml: Document = pageserver_conf_toml.parse().unwrap();
-        let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap();
+        let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap();

        match &conf.default_tenant_conf.eviction_policy {
            EvictionPolicy::OnlyImitiate(t) => {
@@ -1725,7 +1680,7 @@ threshold = "20m"
 remote_storage = {}
        "#;
        let doc = toml_edit::Document::from_str(input).unwrap();
-        let err = PageServerConf::parse_and_validate(&doc, &workdir)
+        let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir)
            .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
        assert!(format!("{err}").contains("remote_storage"), "{err}");
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -1,5 +1,6 @@
 //! Periodically collect consumption metrics for all active tenants
 //! and push them to a HTTP endpoint.
+use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::size::CalculateSyntheticSizeError;
@@ -39,56 +40,74 @@ type RawMetric = (MetricsKey, (EventType, u64));
 /// for deduplication, but that is no longer needed.
 type Cache = HashMap<MetricsKey, (EventType, u64)>;

+pub async fn run(
+    conf: &'static PageServerConf,
+    tenant_manager: Arc<TenantManager>,
+    cancel: CancellationToken,
+) {
+    let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else {
+        return;
+    };
+
+    let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
+
+    let metrics_ctx = RequestContext::todo_child(
+        TaskKind::MetricsCollection,
+        // This task itself shouldn't download anything.
+        // The actual size calculation does need downloads, and
+        // creates a child context with the right DownloadBehavior.
+        DownloadBehavior::Error,
+    );
+    let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+        "consumption metrics collection",
+        collect_metrics(
+            tenant_manager.clone(),
+            metric_collection_endpoint,
+            &conf.metric_collection_bucket,
+            conf.metric_collection_interval,
+            conf.id,
+            local_disk_storage,
+            cancel.clone(),
+            metrics_ctx,
+        )
+        .instrument(info_span!("metrics_collection")),
+    ));
+
+    let worker_ctx =
+        RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
+    let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+        "synthetic size calculation",
+        calculate_synthetic_size_worker(
+            tenant_manager.clone(),
+            conf.synthetic_size_calculation_interval,
+            cancel.clone(),
+            worker_ctx,
+        )
+        .instrument(info_span!("synthetic_size_worker")),
+    ));
+
+    let (collect_metrics, synthetic_size_worker) =
+        futures::future::join(collect_metrics, synthetic_size_worker).await;
+    collect_metrics
+        .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process");
+    synthetic_size_worker
+        .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process");
+}
+
 /// Main thread that serves metrics collection
 #[allow(clippy::too_many_arguments)]
-pub async fn collect_metrics(
+async fn collect_metrics(
    tenant_manager: Arc<TenantManager>,
    metric_collection_endpoint: &Url,
    metric_collection_bucket: &Option<RemoteStorageConfig>,
    metric_collection_interval: Duration,
-    _cached_metric_collection_interval: Duration,
-    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
    local_disk_storage: Utf8PathBuf,
    cancel: CancellationToken,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
-    if _cached_metric_collection_interval != Duration::ZERO {
-        tracing::warn!(
-            "cached_metric_collection_interval is no longer used, please set it to zero."
-        )
-    }
-
-    // spin up background worker that caclulates tenant sizes
-    let worker_ctx =
-        ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::CalculateSyntheticSize,
-        None,
-        None,
-        "synthetic size calculation",
-        false,
-        {
-            let tenant_manager = tenant_manager.clone();
-            async move {
-                calculate_synthetic_size_worker(
-                    tenant_manager,
-                    synthetic_size_calculation_interval,
-                    &cancel,
-                    &worker_ctx,
-                )
-                .instrument(info_span!("synthetic_size_worker"))
-                .await?;
-                Ok(())
-            }
-        },
-    );
-
    let path: Arc<Utf8PathBuf> = Arc::new(local_disk_storage);

-    let cancel = task_mgr::shutdown_token();
-
    let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);

    let mut cached_metrics = tokio::select! {
@@ -103,7 +122,7 @@ pub async fn collect_metrics(
        .expect("Failed to create http client with timeout");

    let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
-        match GenericRemoteStorage::from_config(bucket_config) {
+        match GenericRemoteStorage::from_config(bucket_config).await {
            Ok(client) => Some(client),
            Err(e) => {
                // Non-fatal error: if we were given an invalid config, we will proceed
@@ -175,11 +194,9 @@ pub async fn collect_metrics(
            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
        );

-        let res = tokio::time::timeout_at(
-            started_at + metric_collection_interval,
-            task_mgr::shutdown_token().cancelled(),
-        )
-        .await;
+        let res =
+            tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled())
+                .await;
        if res.is_ok() {
            return Ok(());
        }
@@ -279,8 +296,8 @@ async fn reschedule(
 async fn calculate_synthetic_size_worker(
    tenant_manager: Arc<TenantManager>,
    synthetic_size_calculation_interval: Duration,
-    cancel: &CancellationToken,
-    ctx: &RequestContext,
+    cancel: CancellationToken,
+    ctx: RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
    scopeguard::defer! {
@@ -320,7 +337,7 @@ async fn calculate_synthetic_size_worker(
            // there is never any reason to exit calculate_synthetic_size_worker following any
            // return value -- we don't need to care about shutdown because no tenant is found when
            // pageserver is shut down.
-            calculate_and_log(&tenant, cancel, ctx).await;
+            calculate_and_log(&tenant, &cancel, &ctx).await;
        }

        crate::tenant::tasks::warn_when_period_overrun(
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -59,6 +59,7 @@
 //! 1. It should be easy to forward the context to callees.
 //! 2. To propagate more data from high-level to low-level code, the functions in
 //!    the middle should not need to be modified.
+//!
 //! The solution is to have a container structure ([`RequestContext`]) that
 //! carries the information. Functions that don't care about what's in it
 //! pass it along to callees.
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -828,9 +828,9 @@ mod test {
        }
    }

-    fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
+    async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
        let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
-        let harness = TenantHarness::create(test_name)?;
+        let harness = TenantHarness::create(test_name).await?;

        // We do not load() the harness: we only need its config and remote_storage

@@ -844,7 +844,9 @@ mod test {
            },
            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        };
-        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
+        let storage = GenericRemoteStorage::from_config(&storage_config)
+            .await
+            .unwrap();

        let mock_control_plane = MockControlPlane::new();

@@ -922,7 +924,9 @@ mod test {
    #[tokio::test]
    async fn deletion_queue_smoke() -> anyhow::Result<()> {
        // Basic test that the deletion queue processes the deletions we pass into it
-        let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
+        let ctx = setup("deletion_queue_smoke")
+            .await
+            .expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

@@ -992,7 +996,9 @@ mod test {

    #[tokio::test]
    async fn deletion_queue_validation() -> anyhow::Result<()> {
-        let ctx = setup("deletion_queue_validation").expect("Failed test setup");
+        let ctx = setup("deletion_queue_validation")
+            .await
+            .expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

@@ -1051,7 +1057,9 @@ mod test {
    #[tokio::test]
    async fn deletion_queue_recovery() -> anyhow::Result<()> {
        // Basic test that the deletion queue processes the deletions we pass into it
-        let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
+        let mut ctx = setup("deletion_queue_recovery")
+            .await
+            .expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -59,13 +59,14 @@ use utils::{completion, id::TimelineId};
 use crate::{
    config::PageServerConf,
    metrics::disk_usage_based_eviction::METRICS,
-    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    task_mgr::{self, BACKGROUND_RUNTIME},
    tenant::{
        mgr::TenantManager,
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
    },
+    CancellableTask, DiskUsageEvictionTask,
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -83,17 +84,9 @@ pub struct DiskUsageEvictionTaskConfig {

 /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
 /// partitioning.
-#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(tag = "type", content = "args")]
 pub enum EvictionOrder {
-    /// Order the layers to be evicted by how recently they have been accessed in absolute
-    /// time.
-    ///
-    /// This strategy is unfair when some tenants grow faster than others towards the slower
-    /// growing.
-    #[default]
-    AbsoluteAccessed,
-
    /// Order the layers to be evicted by how recently they have been accessed relatively within
    /// the set of resident layers of a tenant.
    RelativeAccessed {
@@ -108,6 +101,14 @@ pub enum EvictionOrder {
    },
 }

+impl Default for EvictionOrder {
+    fn default() -> Self {
+        Self::RelativeAccessed {
+            highest_layer_count_loses_first: true,
+        }
+    }
+}
+
 fn default_highest_layer_count_loses_first() -> bool {
    true
 }
@@ -117,11 +118,6 @@ impl EvictionOrder {
        use EvictionOrder::*;

        match self {
-            AbsoluteAccessed => {
-                candidates.sort_unstable_by_key(|(partition, candidate)| {
-                    (*partition, candidate.last_activity_ts)
-                });
-            }
            RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
                (*partition, candidate.relative_last_activity)
            }),
@@ -134,7 +130,6 @@ impl EvictionOrder {
        use EvictionOrder::*;

        match self {
-            AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
            RelativeAccessed {
                highest_layer_count_loses_first,
            } => {
@@ -192,36 +187,34 @@ pub fn launch_disk_usage_global_eviction_task(
    state: Arc<State>,
    tenant_manager: Arc<TenantManager>,
    background_jobs_barrier: completion::Barrier,
-) -> anyhow::Result<()> {
+) -> Option<DiskUsageEvictionTask> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
        info!("disk usage based eviction task not configured");
-        return Ok(());
+        return None;
    };

    info!("launching disk usage based eviction task");

-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::DiskUsageEviction,
-        None,
-        None,
+    let cancel = CancellationToken::new();
+    let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
        "disk usage based eviction",
-        false,
-        async move {
-            let cancel = task_mgr::shutdown_token();
+        {
+            let cancel = cancel.clone();
+            async move {
+                // wait until initial load is complete, because we cannot evict from loading tenants.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return anyhow::Ok(()); },
+                    _ = background_jobs_barrier.wait() => { }
+                };

-            // wait until initial load is complete, because we cannot evict from loading tenants.
-            tokio::select! {
-                _ = cancel.cancelled() => { return Ok(()); },
-                _ = background_jobs_barrier.wait() => { }
-            };
-
-            disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
-            Ok(())
+                disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel)
+                    .await;
+                anyhow::Ok(())
+            }
        },
-    );
+    ));

-    Ok(())
+    Some(DiskUsageEvictionTask(CancellableTask { cancel, task }))
 }

 #[instrument(skip_all)]
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -377,7 +377,7 @@ paths:
              schema:
                $ref: "#/components/schemas/ConflictError"

-  /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
+  /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive:
    parameters:
      - name: tenant_id
        in: path
@@ -397,6 +397,51 @@ paths:
        "202":
          description: Tenant scheduled to load successfully

+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+    put:
+      description: |
+        Either archives or unarchives the given timeline.
+        An archived timeline may not have any non-archived children.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/ArchivalConfigRequest"
+      responses:
+        "200":
+          description: Timeline (un)archived successfully
+        "409":
+          description: |
+            The tenant/timeline is already being modified, perhaps by a concurrent call to this API
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
      - name: tenant_id
@@ -429,7 +474,9 @@ paths:
              schema:
                $ref: "#/components/schemas/SyntheticSizeResponse"
            text/html:
-              description: SVG representation of the tenant and it's timelines.
+              schema:
+                type: string
+                description: SVG representation of the tenant and its timelines.
        "401":
          description: Unauthorized Error
          content:
@@ -568,7 +615,7 @@ paths:
          type: string
      - name: timeline_id
        in: path
-        ŕequired: true
+        required: true
        schema:
          type: string

@@ -774,15 +821,13 @@ components:
    TenantCreateRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
+        - $ref: '#/components/schemas/TenantLoadRequest'
        - type: object
          required:
            - new_tenant_id
          properties:
            new_tenant_id:
              type: string
-            generation:
-              type: integer
-              description: Attachment generation number.
    TenantLoadRequest:
      type: object
      properties:
@@ -846,6 +891,15 @@ components:
        warm:
          type: boolean
          description: Whether to poll remote storage for layers to download.  If false, secondary locations don't download anything.
+    ArchivalConfigRequest:
+      type: object
+      required
+        - state
+      properties:
+        state:
+          description: The archival state of a timeline
+          type: string
+          enum: ["Archived", "Unarchived"]
    TenantConfig:
      type: object
      properties:
@@ -873,8 +927,6 @@ components:
          type: string
        max_lsn_wal_lag:
          type: integer
-        trace_read_requests:
-          type: boolean
        heatmap_period:
          type: string
    TenantConfigResponse:
@@ -1108,7 +1160,7 @@ components:
        reparented_timelines:
          type: array
          description: Set of reparented timeline ids
-          properties:
+          items:
            type: string
            format: hex
            description: TimelineId
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -18,14 +18,17 @@ use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::AuxFilePolicy;
+use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use pageserver_api::models::IngestAuxFilesRequest;
 use pageserver_api::models::ListAuxFilesRequest;
 use pageserver_api::models::LocationConfig;
 use pageserver_api::models::LocationConfigListResponse;
+use pageserver_api::models::LocationConfigMode;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::LsnLeaseRequest;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
+use pageserver_api::models::TenantLocationConfigRequest;
 use pageserver_api::models::TenantLocationConfigResponse;
 use pageserver_api::models::TenantScanRemoteStorageResponse;
 use pageserver_api::models::TenantScanRemoteStorageShard;
@@ -33,12 +36,10 @@ use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantSorting;
+use pageserver_api::models::TimelineArchivalConfigRequest;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::TopTenantShardsRequest;
 use pageserver_api::models::TopTenantShardsResponse;
-use pageserver_api::models::{
-    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
-};
 use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
@@ -664,6 +665,39 @@ async fn timeline_preserve_initdb_handler(
    json_response(StatusCode::OK, ())
 }

+async fn timeline_archival_config_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);
+
+    async {
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        tenant
+            .apply_timeline_archival_config(timeline_id, request_data.state)
+            .await
+            .context("applying archival config")
+            .map_err(ApiError::InternalServerError)?;
+        Ok::<_, ApiError>(())
+    }
+    .instrument(info_span!("timeline_archival_config",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                state = ?request_data.state,
+                %timeline_id))
+    .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_detail_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1642,6 +1676,10 @@ async fn timeline_checkpoint_handler(
    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
        flags |= CompactFlags::ForceImageLayerCreation;
    }
+
+    // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
+    let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
+
    let wait_until_uploaded =
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

@@ -1658,15 +1696,17 @@ async fn timeline_checkpoint_handler(

                }
            })?;
-        timeline
-            .compact(&cancel, flags, &ctx)
-            .await
-            .map_err(|e|
-                match e {
-                    CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                    CompactionError::Other(e) => ApiError::InternalServerError(e)
-                }
-            )?;
+        if compact {
+            timeline
+                .compact(&cancel, flags, &ctx)
+                .await
+                .map_err(|e|
+                    match e {
+                        CompactionError::ShuttingDown => ApiError::ShuttingDown,
+                        CompactionError::Other(e) => ApiError::InternalServerError(e)
+                    }
+                )?;
+        }

        if wait_until_uploaded {
            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
@@ -1721,7 +1761,9 @@ async fn timeline_detach_ancestor_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    use crate::tenant::timeline::detach_ancestor::Options;
+    use crate::tenant::timeline::detach_ancestor;
+    use pageserver_api::models::detach_ancestor::AncestorDetached;
+
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1729,7 +1771,7 @@ async fn timeline_detach_ancestor_handler(
    let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);

    async move {
-        let mut options = Options::default();
+        let mut options = detach_ancestor::Options::default();

        let rewrite_concurrency =
            parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
@@ -1757,27 +1799,36 @@ async fn timeline_detach_ancestor_handler(

        let timeline = tenant.get_timeline(timeline_id, true)?;

-        let (_guard, prepared) = timeline
+        let progress = timeline
            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
            .await?;

-        let res = state
-            .tenant_manager
-            .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
-            .await;
+        // uncomment to allow early as possible Tenant::drop
+        // drop(tenant);

-        match res {
-            Ok(reparented_timelines) => {
-                let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
+        let resp = match progress {
+            detach_ancestor::Progress::Prepared(_guard, prepared) => {
+                // it would be great to tag the guard on to the tenant activation future
+                let reparented_timelines = state
+                    .tenant_manager
+                    .complete_detaching_timeline_ancestor(
+                        tenant_shard_id,
+                        timeline_id,
+                        prepared,
+                        ctx,
+                    )
+                    .await
+                    .context("timeline detach ancestor completion")
+                    .map_err(ApiError::InternalServerError)?;
+
+                AncestorDetached {
                    reparented_timelines,
-                };
-
-                json_response(StatusCode::OK, resp)
+                }
            }
-            Err(e) => Err(ApiError::InternalServerError(
-                e.context("timeline detach completion"),
-            )),
-        }
+            detach_ancestor::Progress::Done(resp) => resp,
+        };
+
+        json_response(StatusCode::OK, resp)
    }
    .instrument(span)
    .await
@@ -2778,6 +2829,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
            |r| api_handler(r, timeline_preserve_initdb_handler),
        )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
+            |r| api_handler(r, timeline_archival_config_handler),
+        )
        .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_detail_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -13,6 +13,7 @@ pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
 pub use pageserver_api::keyspace;
+use tokio_util::sync::CancellationToken;
 pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
@@ -23,7 +24,6 @@ pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
-pub mod trace;
 pub mod utilization;
 pub mod virtual_file;
 pub mod walingest;
@@ -33,7 +33,10 @@ pub mod walredo;
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
-use tenant::mgr::TenantManager;
+use tenant::{
+    mgr::{BackgroundPurges, TenantManager},
+    secondary,
+};
 use tracing::info;

 /// Current storage format version
@@ -55,17 +58,39 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 pub use crate::metrics::preinitialize_metrics;

+pub struct CancellableTask {
+    pub task: tokio::task::JoinHandle<()>,
+    pub cancel: CancellationToken,
+}
+pub struct HttpEndpointListener(pub CancellableTask);
+pub struct LibpqEndpointListener(pub CancellableTask);
+pub struct ConsumptionMetricsTasks(pub CancellableTask);
+pub struct DiskUsageEvictionTask(pub CancellableTask);
+impl CancellableTask {
+    pub async fn shutdown(self) {
+        self.cancel.cancel();
+        self.task.await.unwrap();
+    }
+}
+
 #[tracing::instrument(skip_all, fields(%exit_code))]
+#[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
+    http_listener: HttpEndpointListener,
+    libpq_listener: LibpqEndpointListener,
+    consumption_metrics_worker: ConsumptionMetricsTasks,
+    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
+    background_purges: BackgroundPurges,
    mut deletion_queue: DeletionQueue,
+    secondary_controller_tasks: secondary::GlobalTasks,
    exit_code: i32,
 ) {
    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
+        libpq_listener.0.shutdown(),
        "shutdown LibpqEndpointListener",
        Duration::from_secs(1),
    )
@@ -92,16 +117,44 @@ pub async fn shutdown_pageserver(
    // Best effort to persist any outstanding deletions, to avoid leaking objects
    deletion_queue.shutdown(Duration::from_secs(5)).await;

+    timed(
+        consumption_metrics_worker.0.shutdown(),
+        "shutdown consumption metrics",
+        Duration::from_secs(1),
+    )
+    .await;
+
+    timed(
+        futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())),
+        "shutdown disk usage eviction",
+        Duration::from_secs(1),
+    )
+    .await;
+
+    timed(
+        background_purges.shutdown(),
+        "shutdown background purges",
+        Duration::from_secs(1),
+    )
+    .await;
+
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
+        http_listener.0.shutdown(),
        "shutdown http",
        Duration::from_secs(1),
    )
    .await;

+    timed(
+        secondary_controller_tasks.wait(), // cancellation happened in caller
+        "secondary controller wait",
+        Duration::from_secs(1),
+    )
+    .await;
+
    // There should be nothing left, but let's be sure
    timed(
        task_mgr::shutdown_tasks(None, None, None),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum MetricLayerKind {
+    Delta,
+    Image,
+}
+
+static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_layer_bytes",
+        "Sum of layer physical sizes in bytes",
+        &["tenant_id", "shard_id", "timeline_id", "kind"]
+    )
+    .expect("failed to define a metric")
+});
+
+static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_layer_count",
+        "Number of layers that exist",
+        &["tenant_id", "shard_id", "timeline_id", "kind"]
+    )
+    .expect("failed to define a metric")
+});
+
 static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_archive_size",
@@ -569,6 +594,38 @@ static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_circuit_breaker_broken",
+        "How many times a circuit breaker has broken"
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_circuit_breaker_unbroken",
+        "How many times a circuit breaker has been un-broken (recovered)"
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_compression_image_in_bytes_total",
+        "Size of uncompressed data written into image layers"
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_compression_image_out_bytes_total",
+        "Size of compressed image layer written"
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) mod initial_logical_size {
    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -1474,7 +1531,6 @@ pub(crate) enum ComputeCommandKind {
    Basebackup,
    Fullbackup,
    LeaseLsn,
-    Show,
 }

 pub(crate) struct ComputeCommandCounters {
@@ -2126,6 +2182,10 @@ pub(crate) struct TimelineMetrics {
    pub last_record_gauge: IntGauge,
    pub pitr_history_size: UIntGauge,
    pub archival_size: UIntGauge,
+    pub(crate) layer_size_image: UIntGauge,
+    pub(crate) layer_count_image: UIntGauge,
+    pub(crate) layer_size_delta: UIntGauge,
+    pub(crate) layer_count_delta: UIntGauge,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
@@ -2208,6 +2268,42 @@ impl TimelineMetrics {
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

+        let layer_size_image = TIMELINE_LAYER_SIZE
+            .get_metric_with_label_values(&[
+                &tenant_id,
+                &shard_id,
+                &timeline_id,
+                MetricLayerKind::Image.into(),
+            ])
+            .unwrap();
+
+        let layer_count_image = TIMELINE_LAYER_COUNT
+            .get_metric_with_label_values(&[
+                &tenant_id,
+                &shard_id,
+                &timeline_id,
+                MetricLayerKind::Image.into(),
+            ])
+            .unwrap();
+
+        let layer_size_delta = TIMELINE_LAYER_SIZE
+            .get_metric_with_label_values(&[
+                &tenant_id,
+                &shard_id,
+                &timeline_id,
+                MetricLayerKind::Delta.into(),
+            ])
+            .unwrap();
+
+        let layer_count_delta = TIMELINE_LAYER_COUNT
+            .get_metric_with_label_values(&[
+                &tenant_id,
+                &shard_id,
+                &timeline_id,
+                MetricLayerKind::Delta.into(),
+            ])
+            .unwrap();
+
        let standby_horizon_gauge = STANDBY_HORIZON
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -2262,6 +2358,10 @@ impl TimelineMetrics {
            last_record_gauge,
            pitr_history_size,
            archival_size,
+            layer_size_image,
+            layer_count_image,
+            layer_size_delta,
+            layer_count_delta,
            standby_horizon_gauge,
            resident_physical_size_gauge,
            current_logical_size_gauge,
@@ -2323,6 +2423,31 @@ impl TimelineMetrics {
        let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

+        let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+            MetricLayerKind::Image.into(),
+        ]);
+        let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+            MetricLayerKind::Image.into(),
+        ]);
+        let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+            MetricLayerKind::Delta.into(),
+        ]);
+        let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+            MetricLayerKind::Delta.into(),
+        ]);
+
        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -36,7 +36,6 @@ use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::id::ConnectionId;
 use utils::sync::gate::GateGuard;
 use utils::{
    auth::{Claims, Scope, SwappableJwtAuth},
@@ -66,7 +65,6 @@ use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Tenant;
 use crate::tenant::Timeline;
-use crate::trace::Tracer;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -126,7 +124,6 @@ pub async fn libpq_listener_main(
                    None,
                    None,
                    "serving compute connection task",
-                    false,
                    page_service_conn_main(
                        tenant_manager.clone(),
                        local_auth,
@@ -430,18 +427,6 @@ impl PageServerHandler {
            .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
            .await?;

-        // Make request tracer if needed
-        let mut tracer = if tenant.get_trace_read_requests() {
-            let connection_id = ConnectionId::generate();
-            let path =
-                tenant
-                    .conf
-                    .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
-            Some(Tracer::new(path))
-        } else {
-            None
-        };
-
        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
        self.flush_cancellable(pgb, &tenant.cancel).await?;
@@ -473,11 +458,6 @@ impl PageServerHandler {
            trace!("query: {copy_data_bytes:?}");
            fail::fail_point!("ps::handle-pagerequest-message");

-            // Trace request if needed
-            if let Some(t) = tracer.as_mut() {
-                t.trace(&copy_data_bytes)
-            }
-
            let neon_fe_msg =
                PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;

@@ -1498,66 +1478,6 @@ where
                    ))?
                }
            };
-        } else if let Some(params) = parts.strip_prefix(&["show"]) {
-            // show <tenant_id>
-            if params.len() != 1 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for config command"
-                )));
-            }
-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-
-            tracing::Span::current().record("tenant_id", field::display(tenant_id));
-
-            self.check_permission(Some(tenant_id))?;
-
-            COMPUTE_COMMANDS_COUNTERS
-                .for_command(ComputeCommandKind::Show)
-                .inc();
-
-            let tenant = self
-                .get_active_tenant_with_timeout(
-                    tenant_id,
-                    ShardSelector::Zero,
-                    ACTIVE_TENANT_TIMEOUT,
-                )
-                .await?;
-            pgb.write_message_noflush(&BeMessage::RowDescription(&[
-                RowDescriptor::int8_col(b"checkpoint_distance"),
-                RowDescriptor::int8_col(b"checkpoint_timeout"),
-                RowDescriptor::int8_col(b"compaction_target_size"),
-                RowDescriptor::int8_col(b"compaction_period"),
-                RowDescriptor::int8_col(b"compaction_threshold"),
-                RowDescriptor::int8_col(b"gc_horizon"),
-                RowDescriptor::int8_col(b"gc_period"),
-                RowDescriptor::int8_col(b"image_creation_threshold"),
-                RowDescriptor::int8_col(b"pitr_interval"),
-            ]))?
-            .write_message_noflush(&BeMessage::DataRow(&[
-                Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
-                Some(
-                    tenant
-                        .get_checkpoint_timeout()
-                        .as_secs()
-                        .to_string()
-                        .as_bytes(),
-                ),
-                Some(tenant.get_compaction_target_size().to_string().as_bytes()),
-                Some(
-                    tenant
-                        .get_compaction_period()
-                        .as_secs()
-                        .to_string()
-                        .as_bytes(),
-                ),
-                Some(tenant.get_compaction_threshold().to_string().as_bytes()),
-                Some(tenant.get_gc_horizon().to_string().as_bytes()),
-                Some(tenant.get_gc_period().as_secs().to_string().as_bytes()),
-                Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
-                Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
-            ]))?
-            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else {
            return Err(QueryError::Other(anyhow::anyhow!(
                "unknown command {query_string}"
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -284,6 +284,16 @@ impl Timeline {
        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
            return Ok(true);
        }
+        // then check if the database was already initialized.
+        // get_rel_exists can be called before dbdir is created.
+        let buf = version.get(self, DBDIR_KEY, ctx).await?;
+        let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => Ok(dir.dbdirs),
+            Err(e) => Err(PageReconstructError::from(e)),
+        }?;
+        if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
+            return Ok(false);
+        }
        // fetch directory listing
        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
        let buf = version.get(self, key, ctx).await?;
@@ -522,7 +532,7 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<Option<TimestampTz>, PageReconstructError> {
        let mut max: Option<TimestampTz> = None;
-        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
+        self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
            if let Some(max_prev) = max {
                max = Some(max_prev.max(timestamp));
            } else {
@@ -2031,7 +2041,7 @@ mod tests {
    #[tokio::test]
    async fn aux_files_round_trip() -> anyhow::Result<()> {
        let name = "aux_files_round_trip";
-        let harness = TenantHarness::create(name)?;
+        let harness = TenantHarness::create(name).await?;

        pub const TIMELINE_ID: TimelineId =
            TimelineId::from_array(hex!("11223344556677881122334455667788"));
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -408,7 +408,6 @@ pub fn spawn<F>(
    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,
    name: &str,
-    shutdown_process_on_error: bool,
    future: F,
 ) -> PageserverTaskId
 where
@@ -437,7 +436,6 @@ where
        task_id,
        task_cloned,
        cancel,
-        shutdown_process_on_error,
        future,
    ));
    task_mut.join_handle = Some(join_handle);
@@ -454,82 +452,78 @@ async fn task_wrapper<F>(
    task_id: u64,
    task: Arc<PageServerTask>,
    shutdown_token: CancellationToken,
-    shutdown_process_on_error: bool,
    future: F,
 ) where
    F: Future<Output = anyhow::Result<()>> + Send + 'static,
 {
    debug!("Starting task '{}'", task_name);

-    let result = SHUTDOWN_TOKEN
-        .scope(
-            shutdown_token,
-            CURRENT_TASK.scope(task, {
-                // We use AssertUnwindSafe here so that the payload function
-                // doesn't need to be UnwindSafe. We don't do anything after the
-                // unwinding that would expose us to unwind-unsafe behavior.
-                AssertUnwindSafe(future).catch_unwind()
-            }),
-        )
-        .await;
-    task_finish(result, task_name, task_id, shutdown_process_on_error).await;
-}
-
-async fn task_finish(
-    result: std::result::Result<
-        anyhow::Result<()>,
-        std::boxed::Box<dyn std::any::Any + std::marker::Send>,
-    >,
-    task_name: String,
-    task_id: u64,
-    shutdown_process_on_error: bool,
-) {
-    // Remove our entry from the global hashmap.
-    let task = TASKS
-        .lock()
-        .unwrap()
-        .remove(&task_id)
-        .expect("no task in registry");
-
-    let mut shutdown_process = false;
-    {
+    // wrap the future so we log panics and errors
+    let tenant_shard_id = task.tenant_shard_id;
+    let timeline_id = task.timeline_id;
+    let fut = async move {
+        // We use AssertUnwindSafe here so that the payload function
+        // doesn't need to be UnwindSafe. We don't do anything after the
+        // unwinding that would expose us to unwind-unsafe behavior.
+        let result = AssertUnwindSafe(future).catch_unwind().await;
        match result {
            Ok(Ok(())) => {
                debug!("Task '{}' exited normally", task_name);
            }
            Ok(Err(err)) => {
-                if shutdown_process_on_error {
-                    error!(
-                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_shard_id, task.timeline_id, err
-                    );
-                    shutdown_process = true;
-                } else {
-                    error!(
-                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_shard_id, task.timeline_id, err
-                    );
-                }
+                error!(
+                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                    task_name, tenant_shard_id, timeline_id, err
+                );
            }
            Err(err) => {
-                if shutdown_process_on_error {
-                    error!(
-                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_shard_id, task.timeline_id, err
-                    );
-                    shutdown_process = true;
-                } else {
-                    error!(
-                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_shard_id, task.timeline_id, err
-                    );
-                }
+                error!(
+                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                    task_name, tenant_shard_id, timeline_id, err
+                );
            }
        }
-    }
+    };

-    if shutdown_process {
-        std::process::exit(1);
+    // add the task-locals
+    let fut = CURRENT_TASK.scope(task, fut);
+    let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut);
+
+    // poll future to completion
+    fut.await;
+
+    // Remove our entry from the global hashmap.
+    TASKS
+        .lock()
+        .unwrap()
+        .remove(&task_id)
+        .expect("no task in registry");
+}
+
+pub async fn exit_on_panic_or_error<T, E>(
+    task_name: &'static str,
+    future: impl Future<Output = Result<T, E>>,
+) -> T
+where
+    E: std::fmt::Debug,
+{
+    // We use AssertUnwindSafe here so that the payload function
+    // doesn't need to be UnwindSafe. We don't do anything after the
+    // unwinding that would expose us to unwind-unsafe behavior.
+    let result = AssertUnwindSafe(future).catch_unwind().await;
+    match result {
+        Ok(Ok(val)) => val,
+        Ok(Err(err)) => {
+            error!(
+                task_name,
+                "Task exited with error, exiting process: {err:?}"
+            );
+            std::process::exit(1);
+        }
+        Err(panic_obj) => {
+            error!(task_name, "Task panicked, exiting process: {panic_obj:?}");
+            std::process::exit(1);
+        }
    }
 }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -137,14 +137,14 @@ impl<'a> BlockCursor<'a> {
 }

 /// Reserved bits for length and compression
-const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
+pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;

 /// The maximum size of blobs we support. The highest few bits
 /// are reserved for compression and other further uses.
 const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;

-const BYTE_UNCOMPRESSED: u8 = 0x80;
-const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
+pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
+pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

 /// A wrapper of `VirtualFile` that allows users to write blobs.
 ///
@@ -390,51 +390,63 @@ impl BlobWriter<false> {
 }

 #[cfg(test)]
-mod tests {
+pub(crate) mod tests {
    use super::*;
    use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
+    use camino::Utf8PathBuf;
+    use camino_tempfile::Utf8TempDir;
    use rand::{Rng, SeedableRng};

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
        round_trip_test_compressed::<BUFFERED>(blobs, false).await
    }

-    async fn round_trip_test_compressed<const BUFFERED: bool>(
+    pub(crate) async fn write_maybe_compressed<const BUFFERED: bool>(
        blobs: &[Vec<u8>],
        compression: bool,
-    ) -> Result<(), Error> {
+        ctx: &RequestContext,
+    ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
        let temp_dir = camino_tempfile::tempdir()?;
        let pathbuf = temp_dir.path().join("file");
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
-            let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
+            let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
                let (_, res) = if compression {
                    wtr.write_blob_maybe_compressed(
                        blob.clone(),
-                        &ctx,
+                        ctx,
                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
                    )
                    .await
                } else {
-                    wtr.write_blob(blob.clone(), &ctx).await
+                    wtr.write_blob(blob.clone(), ctx).await
                };
                let offs = res?;
                offsets.push(offs);
            }
            // Write out one page worth of zeros so that we can
            // read again with read_blk
-            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
+            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
            let offs = res?;
            println!("Writing final blob at offs={offs}");
-            wtr.flush_buffer(&ctx).await?;
+            wtr.flush_buffer(ctx).await?;
        }
+        Ok((temp_dir, pathbuf, offsets))
+    }

-        let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
+    async fn round_trip_test_compressed<const BUFFERED: bool>(
+        blobs: &[Vec<u8>],
+        compression: bool,
+    ) -> Result<(), Error> {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let (_temp_dir, pathbuf, offsets) =
+            write_maybe_compressed::<BUFFERED>(blobs, compression, &ctx).await?;
+
+        let file = VirtualFile::open(pathbuf, &ctx).await?;
        let rdr = BlockReaderRef::VirtualFile(&file);
        let rdr = BlockCursor::new_with_compression(rdr, compression);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
@@ -447,7 +459,7 @@ mod tests {
        Ok(())
    }

-    fn random_array(len: usize) -> Vec<u8> {
+    pub(crate) fn random_array(len: usize) -> Vec<u8> {
        let mut rng = rand::thread_rng();
        (0..len).map(|_| rng.gen()).collect::<_>()
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -335,7 +335,6 @@ pub struct TenantConf {
    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
    /// to avoid eager reconnects.
    pub max_lsn_wal_lag: NonZeroU64,
-    pub trace_read_requests: bool,
    pub eviction_policy: EvictionPolicy,
    pub min_resident_size_override: Option<u64>,
    // See the corresponding metric's help string.
@@ -436,10 +435,6 @@ pub struct TenantConfOpt {
    #[serde(default)]
    pub max_lsn_wal_lag: Option<NonZeroU64>,

-    #[serde(skip_serializing_if = "Option::is_none")]
-    #[serde(default)]
-    pub trace_read_requests: Option<bool>,
-
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub eviction_policy: Option<EvictionPolicy>,
@@ -519,9 +514,6 @@ impl TenantConfOpt {
                .lagging_wal_timeout
                .unwrap_or(global_conf.lagging_wal_timeout),
            max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
-            trace_read_requests: self
-                .trace_read_requests
-                .unwrap_or(global_conf.trace_read_requests),
            eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
            min_resident_size_override: self
                .min_resident_size_override
@@ -581,7 +573,6 @@ impl Default for TenantConf {
                .expect("cannot parse default walreceiver lagging wal timeout"),
            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
                .expect("cannot parse default max walreceiver Lsn wal lag"),
-            trace_read_requests: false,
            eviction_policy: EvictionPolicy::NoEviction,
            min_resident_size_override: None,
            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
@@ -659,7 +650,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
            walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
            lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
            max_lsn_wal_lag: value.max_lsn_wal_lag,
-            trace_read_requests: value.trace_read_requests,
            eviction_policy: value.eviction_policy,
            min_resident_size_override: value.min_resident_size_override,
            evictions_low_residence_duration_metric_threshold: value
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -262,7 +262,7 @@ where

    pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
    where
-        R: 'a,
+        R: 'a + Send,
    {
        DiskBtreeIterator {
            stream: Box::pin(self.into_stream(start_key, ctx)),
@@ -521,7 +521,7 @@ where
 pub struct DiskBtreeIterator<'a> {
    #[allow(clippy::type_complexity)]
    stream: std::pin::Pin<
-        Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a>,
+        Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a + Send>,
    >,
 }

@@ -550,10 +550,10 @@ where
    /// We maintain the length of the stack to be always greater than zero.
    /// Two exceptions are:
    /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
-    ///   So because other methods cannot see the intermediate state invariant still holds.
+    ///    So because other methods cannot see the intermediate state invariant still holds.
    /// 2. `Self::finish`. It consumes self and does not return it back,
-    ///  which means that this is where the structure is destroyed.
-    ///  Thus stack of zero length cannot be observed by other methods.
+    ///    which means that this is where the structure is destroyed.
+    ///    Thus stack of zero length cannot be observed by other methods.
    stack: Vec<BuildNode<L>>,

    /// Last key that was appended to the tree. Used to sanity check that append
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -463,7 +463,7 @@ impl LayerMap {
    pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094

-        if Self::is_l0(&layer_desc) {
+        if Self::is_l0(&layer_desc.key_range) {
            self.l0_delta_layers.push(layer_desc.clone().into());
        }

@@ -482,7 +482,7 @@ impl LayerMap {
        self.historic
            .remove(historic_layer_coverage::LayerKey::from(layer_desc));
        let layer_key = layer_desc.key();
-        if Self::is_l0(layer_desc) {
+        if Self::is_l0(&layer_desc.key_range) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
            l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -598,8 +598,9 @@ impl LayerMap {
        coverage
    }

-    pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
-        layer.get_key_range() == (Key::MIN..Key::MAX)
+    /// Check if the key range resembles that of an L0 layer.
+    pub fn is_l0(key_range: &Range<Key>) -> bool {
+        key_range == &(Key::MIN..Key::MAX)
    }

    /// This function determines which layers are counted in `count_deltas`:
@@ -626,7 +627,7 @@ impl LayerMap {
    ///      than just the current partition_range.
    pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
        // Case 1
-        if !Self::is_l0(layer) {
+        if !Self::is_l0(&layer.key_range) {
            return true;
        }

--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -36,7 +36,7 @@ use crate::control_plane_client::{
 use crate::deletion_queue::DeletionQueueClient;
 use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
 use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
-use crate::task_mgr::{self, TaskKind};
+use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
 };
@@ -225,26 +225,98 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
    Ok(tmp_path)
 }

-/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
-/// the background, and thereby avoid blocking any API requests on this deletion completing.
-fn spawn_background_purge(tmp_path: Utf8PathBuf) {
-    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
-    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
-    let task_tenant_id = None;
+/// See [`Self::spawn`].
+#[derive(Clone)]
+pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
+enum BackgroundPurgesInner {
+    Open(tokio::task::JoinSet<()>),
+    // we use the async mutex for coalescing
+    ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
+}

-    task_mgr::spawn(
-        task_mgr::BACKGROUND_RUNTIME.handle(),
-        TaskKind::MgmtRequest,
-        task_tenant_id,
-        None,
-        "tenant_files_delete",
-        false,
-        async move {
-            fs::remove_dir_all(tmp_path.as_path())
-                .await
-                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-        },
-    );
+impl Default for BackgroundPurges {
+    fn default() -> Self {
+        Self(Arc::new(std::sync::Mutex::new(
+            BackgroundPurgesInner::Open(JoinSet::new()),
+        )))
+    }
+}
+
+impl BackgroundPurges {
+    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
+    /// the background, and thereby avoid blocking any API requests on this deletion completing.
+    ///
+    /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
+    /// Thus the [`BackgroundPurges`] type to keep track of these tasks.
+    pub fn spawn(&self, tmp_path: Utf8PathBuf) {
+        let mut guard = self.0.lock().unwrap();
+        let jset = match &mut *guard {
+            BackgroundPurgesInner::Open(ref mut jset) => jset,
+            BackgroundPurgesInner::ShuttingDown(_) => {
+                warn!("trying to spawn background purge during shutdown, ignoring");
+                return;
+            }
+        };
+        jset.spawn_on(
+            async move {
+                if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
+                    // should we fatal_io_error here?
+                    warn!(%error, path=%tmp_path, "failed to purge tenant directory");
+                }
+            }
+            .instrument(info_span!(parent: None, "background_purge")),
+            BACKGROUND_RUNTIME.handle(),
+        );
+    }
+
+    /// When this future completes, all background purges have completed.
+    /// The first poll of the future will already lock out new background purges spawned via [`Self::spawn`].
+    ///
+    /// Concurrent calls will coalesce.
+    ///
+    /// # Cancellation-Safety
+    ///
+    /// If this future is dropped before polled to completion, concurrent and subsequent
+    /// instances of this future will continue to be correct.
+    #[instrument(skip_all)]
+    pub async fn shutdown(&self) {
+        let jset = {
+            let mut guard = self.0.lock().unwrap();
+            match &mut *guard {
+                BackgroundPurgesInner::Open(jset) => {
+                    *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
+                        std::mem::take(jset),
+                    )))
+                }
+                BackgroundPurgesInner::ShuttingDown(_) => {
+                    // calling shutdown multiple times is most likely a bug in pageserver shutdown code
+                    warn!("already shutting down");
+                }
+            };
+            match &mut *guard {
+                BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
+                BackgroundPurgesInner::Open(_) => {
+                    unreachable!("above code transitions into shut down state");
+                }
+            }
+        };
+        let mut jset = jset.lock().await; // concurrent callers coalesce here
+        while let Some(res) = jset.join_next().await {
+            match res {
+                Ok(()) => {}
+                Err(e) if e.is_panic() => {
+                    // If it panicked, the error is already logged by the panic hook.
+                }
+                Err(e) if e.is_cancelled() => {
+                    unreachable!("we don't cancel the joinset or runtime")
+                }
+                Err(e) => {
+                    // No idea when this can happen, but let's log it.
+                    warn!(%e, "background purge task failed or panicked");
+                }
+            }
+        }
+    }
 }

 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
@@ -270,6 +342,8 @@ pub struct TenantManager {
    // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
    // when the tenant detaches.
    cancel: CancellationToken,
+
+    background_purges: BackgroundPurges,
 }

 fn emergency_generations(
@@ -447,6 +521,7 @@ pub(crate) enum DeleteTenantError {
 #[instrument(skip_all)]
 pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
+    background_purges: BackgroundPurges,
    resources: TenantSharedResources,
    init_order: InitializationOrder,
    cancel: CancellationToken,
@@ -512,7 +587,7 @@ pub async fn init_tenant_mgr(

                    match safe_rename_tenant_dir(&tenant_dir_path).await {
                        Ok(tmp_path) => {
-                            spawn_background_purge(tmp_path);
+                            background_purges.spawn(tmp_path);
                        }
                        Err(e) => {
                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
@@ -634,6 +709,7 @@ pub async fn init_tenant_mgr(
        tenants: &TENANTS,
        resources,
        cancel: CancellationToken::new(),
+        background_purges,
    })
 }

@@ -1353,6 +1429,7 @@ impl TenantManager {

        async fn delete_local(
            conf: &PageServerConf,
+            background_purges: &BackgroundPurges,
            tenant_shard_id: &TenantShardId,
        ) -> anyhow::Result<()> {
            let local_tenant_directory = conf.tenant_path(tenant_shard_id);
@@ -1361,7 +1438,7 @@ impl TenantManager {
                .with_context(|| {
                    format!("local tenant directory {local_tenant_directory:?} rename")
                })?;
-            spawn_background_purge(tmp_dir);
+            background_purges.spawn(tmp_dir);
            Ok(())
        }

@@ -1379,12 +1456,12 @@ impl TenantManager {
                        barrier.wait().await;
                    }
                }
-                delete_local(self.conf, &tenant_shard_id).await?;
+                delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?;
            }
            Some(TenantSlot::Secondary(secondary_tenant)) => {
                secondary_tenant.shutdown().await;

-                delete_local(self.conf, &tenant_shard_id).await?;
+                delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?;
            }
            Some(TenantSlot::InProgress(_)) => unreachable!(),
            None => {}
@@ -1655,7 +1732,7 @@ impl TenantManager {
        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
-        spawn_background_purge(tmp_path);
+        self.background_purges.spawn(tmp_path);

        fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
            "failpoint"
@@ -1831,7 +1908,7 @@ impl TenantManager {
        let tmp_path = self
            .detach_tenant0(conf, tenant_shard_id, deletion_queue_client)
            .await?;
-        spawn_background_purge(tmp_path);
+        self.background_purges.spawn(tmp_path);

        Ok(())
    }
@@ -2698,7 +2775,9 @@ mod tests {
        // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
        // wait for it to complete before proceeding.

-        let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
+        let h = TenantHarness::create("shutdown_awaits_in_progress_tenant")
+            .await
+            .unwrap();
        let (t, _ctx) = h.load().await;

        // harness loads it to active, which is forced and nothing is running on the tenant
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -241,7 +241,7 @@ use self::index::IndexPart;

 use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerName, ResidentLayer};
-use super::upload_queue::SetDeletedFlagProgress;
+use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
 use super::Generation;

 pub(crate) use download::{
@@ -1525,7 +1525,6 @@ impl RemoteTimelineClient {
                Some(self.tenant_shard_id),
                Some(self.timeline_id),
                "remote upload",
-                false,
                async move {
                    self_rc.perform_upload_task(task).await;
                    Ok(())
@@ -1930,6 +1929,31 @@ impl RemoteTimelineClient {
            }
        }
    }
+
+    /// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue
+    /// externally to RemoteTimelineClient.
+    pub(crate) fn initialized_upload_queue(
+        &self,
+    ) -> Result<UploadQueueAccessor<'_>, NotInitialized> {
+        let mut inner = self.upload_queue.lock().unwrap();
+        inner.initialized_mut()?;
+        Ok(UploadQueueAccessor { inner })
+    }
+}
+
+pub(crate) struct UploadQueueAccessor<'a> {
+    inner: std::sync::MutexGuard<'a, UploadQueue>,
+}
+
+impl<'a> UploadQueueAccessor<'a> {
+    pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
+        match &*self.inner {
+            UploadQueue::Initialized(x) => &x.clean.0,
+            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
+                unreachable!("checked before constructing")
+            }
+        }
+    }
 }

 pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -2103,7 +2127,7 @@ mod tests {
    impl TestSetup {
        async fn new(test_name: &str) -> anyhow::Result<Self> {
            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
-            let harness = TenantHarness::create(test_name)?;
+            let harness = TenantHarness::create(test_name).await?;
            let (tenant, ctx) = harness.load().await;

            let timeline = tenant
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -176,6 +176,24 @@ pub(crate) struct Lineage {
    ///
    /// If you are adding support for detaching from a hierarchy, consider changing the ancestry
    /// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
+    // FIXME: this is insufficient even for path of two timelines for future wal recovery
+    // purposes:
+    //
+    // assuming a "old main" which has received most of the WAL, and has a branch "new main",
+    // starting a bit before "old main" last_record_lsn. the current version works fine,
+    // because we will know to replay wal and branch at the recorded Lsn to do wal recovery.
+    //
+    // then assuming "new main" would similarly receive a branch right before its last_record_lsn,
+    // "new new main". the current implementation would just store ("new main", ancestor_lsn, _)
+    // here. however, we cannot recover from WAL using only that information, we would need the
+    // whole ancestry here:
+    //
+    // ```json
+    // [
+    //   ["old main", ancestor_lsn("new main"), _],
+    //   ["new main", ancestor_lsn("new new main"), _]
+    // ]
+    // ```
    #[serde(skip_serializing_if = "Option::is_none", default)]
    original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
 }
@@ -217,6 +235,14 @@ impl Lineage {
        self.original_ancestor
            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
    }
+
+    pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
+        self.original_ancestor.is_some()
+    }
+
+    pub(crate) fn is_reparented(&self) -> bool {
+        !self.reparenting_history.is_empty()
+    }
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -31,6 +31,7 @@ use pageserver_api::{
 };
 use remote_storage::GenericRemoteStorage;

+use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::instrument;
 use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
@@ -293,15 +294,50 @@ impl SecondaryController {
    }
 }

+pub struct GlobalTasks {
+    cancel: CancellationToken,
+    uploader: JoinHandle<()>,
+    downloader: JoinHandle<()>,
+}
+
+impl GlobalTasks {
+    /// Caller is responsible for requesting shutdown via the cancellation token that was
+    /// passed to [`spawn_tasks`].
+    ///
+    /// # Panics
+    ///
+    /// This method panics if that token is not cancelled.
+    /// This is low-risk because we're calling this during process shutdown, so, a panic
+    /// will be informative but not cause undue downtime.
+    pub async fn wait(self) {
+        let Self {
+            cancel,
+            uploader,
+            downloader,
+        } = self;
+        assert!(
+            cancel.is_cancelled(),
+            "must cancel cancellation token, otherwise the tasks will not shut down"
+        );
+
+        let (uploader, downloader) = futures::future::join(uploader, downloader).await;
+        uploader.expect(
+            "unreachable: exit_on_panic_or_error would catch the panic and exit the process",
+        );
+        downloader.expect(
+            "unreachable: exit_on_panic_or_error would catch the panic and exit the process",
+        );
+    }
+}
+
 pub fn spawn_tasks(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
-) -> SecondaryController {
+) -> (SecondaryController, GlobalTasks) {
    let mgr_clone = tenant_manager.clone();
    let storage_clone = remote_storage.clone();
-    let cancel_clone = cancel.clone();
    let bg_jobs_clone = background_jobs_can_start.clone();

    let (download_req_tx, download_req_rx) =
@@ -309,17 +345,9 @@ pub fn spawn_tasks(
    let (upload_req_tx, upload_req_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);

-    let downloader_task_ctx = RequestContext::new(
-        TaskKind::SecondaryDownloads,
-        crate::context::DownloadBehavior::Download,
-    );
-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        downloader_task_ctx.task_kind(),
-        None,
-        None,
+    let cancel_clone = cancel.clone();
+    let downloader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
        "secondary tenant downloads",
-        false,
        async move {
            downloader_task(
                mgr_clone,
@@ -327,49 +355,41 @@ pub fn spawn_tasks(
                download_req_rx,
                bg_jobs_clone,
                cancel_clone,
-                downloader_task_ctx,
+                RequestContext::new(
+                    TaskKind::SecondaryDownloads,
+                    crate::context::DownloadBehavior::Download,
+                ),
            )
            .await;
-
-            Ok(())
+            anyhow::Ok(())
        },
-    );
+    ));

-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::SecondaryUploads,
-        None,
-        None,
+    let cancel_clone = cancel.clone();
+    let uploader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
        "heatmap uploads",
-        false,
        async move {
            heatmap_uploader_task(
                tenant_manager,
                remote_storage,
                upload_req_rx,
                background_jobs_can_start,
-                cancel,
+                cancel_clone,
            )
            .await;
-
-            Ok(())
+            anyhow::Ok(())
        },
-    );
+    ));

-    SecondaryController {
-        download_req_tx,
-        upload_req_tx,
-    }
-}
-
-/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
-pub fn null_controller() -> SecondaryController {
-    let (download_req_tx, _download_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
-    let (upload_req_tx, _upload_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-    SecondaryController {
-        upload_req_tx,
-        download_req_tx,
-    }
+    (
+        SecondaryController {
+            upload_req_tx,
+            download_req_tx,
+        },
+        GlobalTasks {
+            cancel,
+            uploader,
+            downloader,
+        },
+    )
 }
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -135,11 +135,9 @@ pub struct TimelineInputs {
    ancestor_lsn: Lsn,
    last_record: Lsn,
    latest_gc_cutoff: Lsn,
-    horizon_cutoff: Lsn,
-    pitr_cutoff: Lsn,

    /// Cutoff point based on GC settings
-    next_gc_cutoff: Lsn,
+    next_pitr_cutoff: Lsn,

    /// Cutoff point calculated from the user-supplied 'max_retention_period'
    retention_param_cutoff: Option<Lsn>,
@@ -150,7 +148,7 @@ pub struct TimelineInputs {

 /// Gathers the inputs for the tenant sizing model.
 ///
-/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
+/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which
 /// is updated on-demand, during the start of this calculation and separate from the
 /// [`TimelineInputs::latest_gc_cutoff`].
 ///
@@ -158,11 +156,8 @@ pub struct TimelineInputs {
 ///
 /// ```text
 /// 0-----|---------|----|------------| · · · · · |·> lsn
-///   initdb_lsn  branchpoints*  next_gc_cutoff  latest
+///   initdb_lsn  branchpoints*  next_pitr_cutoff  latest
 /// ```
-///
-/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
-/// tenant size will be zero.
 pub(super) async fn gather_inputs(
    tenant: &Tenant,
    limit: &Arc<Semaphore>,
@@ -172,7 +167,7 @@ pub(super) async fn gather_inputs(
    cancel: &CancellationToken,
    ctx: &RequestContext,
 ) -> Result<ModelInputs, CalculateSyntheticSizeError> {
-    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
+    // refresh is needed to update [`timeline::GcCutoffs`]
    tenant.refresh_gc_info(cancel, ctx).await?;

    // Collect information about all the timelines
@@ -236,20 +231,18 @@ pub(super) async fn gather_inputs(
        // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
        // actually removing files.
        //
-        // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
+        // We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from
        // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
-        // than a space bound (horizon cutoff).  This means that if someone drops a database and waits for their
+        // than our internal space cutoff.  This means that if someone drops a database and waits for their
        // PITR interval, they will see synthetic size decrease, even if we are still storing data inside
-        // horizon_cutoff.
-        let pitr_cutoff = gc_info.cutoffs.pitr;
-        let horizon_cutoff = gc_info.cutoffs.horizon;
-        let mut next_gc_cutoff = pitr_cutoff;
+        // the space cutoff.
+        let mut next_pitr_cutoff = gc_info.cutoffs.time;

        // If the caller provided a shorter retention period, use that instead of the GC cutoff.
        let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
            let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period));
-            if next_gc_cutoff < param_cutoff {
-                next_gc_cutoff = param_cutoff;
+            if next_pitr_cutoff < param_cutoff {
+                next_pitr_cutoff = param_cutoff;
            }
            Some(param_cutoff)
        } else {
@@ -263,7 +256,7 @@ pub(super) async fn gather_inputs(
            .copied()
            .collect::<Vec<_>>();

-        // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
+        // next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we
        // want to query any logical size before initdb_lsn.
        let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);

@@ -271,10 +264,10 @@ pub(super) async fn gather_inputs(
        let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
            .retain_lsns
            .iter()
-            .filter(|&&lsn| lsn > ancestor_lsn)
+            .filter(|(lsn, _child_id)| lsn > &ancestor_lsn)
            .copied()
            // this assumes there are no other retain_lsns than the branchpoints
-            .map(|lsn| (lsn, LsnKind::BranchPoint))
+            .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint))
            .collect::<Vec<_>>();

        lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
@@ -291,10 +284,10 @@ pub(super) async fn gather_inputs(
            )
        }

-        // Add a point for the GC cutoff
-        let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
+        // Add a point for the PITR cutoff
+        let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
        if !branch_start_needed {
-            lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
+            lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
        }

        lsns.sort_unstable();
@@ -333,7 +326,7 @@ pub(super) async fn gather_inputs(
                    parent: Some(parent),
                    lsn: lsn.0,
                    size: None,
-                    needed: lsn > next_gc_cutoff,
+                    needed: lsn > next_pitr_cutoff,
                },
                timeline_id: timeline.timeline_id,
                kind,
@@ -357,8 +350,8 @@ pub(super) async fn gather_inputs(
                    segment: Segment {
                        parent: Some(lease_parent),
                        lsn: lsn.0,
-                        size: None,                   // Filled in later, if necessary
-                        needed: lsn > next_gc_cutoff, // only needed if the point is within rentention.
+                        size: None,                     // Filled in later, if necessary
+                        needed: lsn > next_pitr_cutoff, // only needed if the point is within rentention.
                    },
                    timeline_id: timeline.timeline_id,
                    kind: LsnKind::LeaseStart,
@@ -398,9 +391,7 @@ pub(super) async fn gather_inputs(
            last_record: last_record_lsn,
            // this is not used above, because it might not have updated recently enough
            latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
-            horizon_cutoff,
-            pitr_cutoff,
-            next_gc_cutoff,
+            next_pitr_cutoff,
            retention_param_cutoff,
            lease_points,
        });
@@ -742,9 +733,7 @@ fn verify_size_for_multiple_branches() {
      "ancestor_lsn": "0/18D3D98",
      "last_record": "0/2230CD0",
      "latest_gc_cutoff": "0/1698C48",
-      "horizon_cutoff": "0/2210CD0",
-      "pitr_cutoff": "0/2210CD0",
-      "next_gc_cutoff": "0/2210CD0",
+      "next_pitr_cutoff": "0/2210CD0",
      "retention_param_cutoff": null,
      "lease_points": []
    },
@@ -753,9 +742,7 @@ fn verify_size_for_multiple_branches() {
      "ancestor_lsn": "0/176D998",
      "last_record": "0/1837770",
      "latest_gc_cutoff": "0/1698C48",
-      "horizon_cutoff": "0/1817770",
-      "pitr_cutoff": "0/1817770",
-      "next_gc_cutoff": "0/1817770",
+      "next_pitr_cutoff": "0/1817770",
      "retention_param_cutoff": null,
      "lease_points": []
    },
@@ -764,9 +751,7 @@ fn verify_size_for_multiple_branches() {
      "ancestor_lsn": "0/0",
      "last_record": "0/18D3D98",
      "latest_gc_cutoff": "0/1698C48",
-      "horizon_cutoff": "0/18B3D98",
-      "pitr_cutoff": "0/18B3D98",
-      "next_gc_cutoff": "0/18B3D98",
+      "next_pitr_cutoff": "0/18B3D98",
      "retention_param_cutoff": null,
      "lease_points": []
    }
@@ -820,9 +805,7 @@ fn verify_size_for_one_branch() {
      "ancestor_lsn": "0/0",
      "last_record": "47/280A5860",
      "latest_gc_cutoff": "47/240A5860",
-      "horizon_cutoff": "47/240A5860",
-      "pitr_cutoff": "47/240A5860",
-      "next_gc_cutoff": "47/240A5860",
+      "next_pitr_cutoff": "47/240A5860",
      "retention_param_cutoff": "0/0",
      "lease_points": []
    }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -6,35 +6,22 @@ pub(crate) mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;
-
-#[cfg(test)]
 pub mod merge_iterator;

 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
-use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use bytes::Bytes;
-use enum_map::EnumMap;
-use enumset::EnumSet;
-use once_cell::sync::Lazy;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
-use pageserver_api::models::{
-    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
-};
-use std::borrow::Cow;
 use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
-use tracing::warn;
-use utils::history_buffer::HistoryBufferWithDropCounter;
-use utils::rate_limit::RateLimit;

-use utils::{id::TimelineId, lsn::Lsn};
+use utils::lsn::Lsn;

 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
@@ -77,9 +64,9 @@ where
 /// call, to collect more records.
 ///
 #[derive(Debug, Default)]
-pub struct ValueReconstructState {
-    pub records: Vec<(Lsn, NeonWalRecord)>,
-    pub img: Option<(Lsn, Bytes)>,
+pub(crate) struct ValueReconstructState {
+    pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
+    pub(crate) img: Option<(Lsn, Bytes)>,
 }

 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
@@ -460,94 +447,92 @@ pub enum ValueReconstructResult {
    Missing,
 }

-#[derive(Debug)]
-pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
-
-/// This struct holds two instances of [`LayerAccessStatsInner`].
-/// Accesses are recorded to both instances.
-/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`].
-/// The `for_eviction_policy` is never reset.
-#[derive(Debug, Default, Clone)]
-struct LayerAccessStatsLocked {
-    for_scraping_api: LayerAccessStatsInner,
-    for_eviction_policy: LayerAccessStatsInner,
+/// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
+/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
+/// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
+/// be used for cache management but not for correctness-critical checks.
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
+pub(crate) enum LayerVisibilityHint {
+    /// A Visible layer might be read while serving a read, because there is not an image layer between it
+    /// and a readable LSN (the tip of the branch or a child's branch point)
+    Visible,
+    /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
+    /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
+    #[allow(unused)]
+    Covered,
+    /// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
+    /// in this state.  Note that newly written layers may be called Visible immediately, this uninitialized
+    /// state is for when existing layers are constructed while loading a timeline.
+    #[default]
+    Uninitialized,
 }

-impl LayerAccessStatsLocked {
-    fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
-        [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
-    }
-}
-
-#[derive(Debug, Default, Clone)]
-struct LayerAccessStatsInner {
-    first_access: Option<LayerAccessStatFullDetails>,
-    count_by_access_kind: EnumMap<LayerAccessKind, u64>,
-    task_kind_flag: EnumSet<TaskKind>,
-    last_accesses: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
-    last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
-}
-
-#[derive(Debug, Clone, Copy)]
-pub(crate) struct LayerAccessStatFullDetails {
-    pub(crate) when: SystemTime,
-    pub(crate) task_kind: TaskKind,
-    pub(crate) access_kind: LayerAccessKind,
-}
+pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);

 #[derive(Clone, Copy, strum_macros::EnumString)]
-pub enum LayerAccessStatsReset {
+pub(crate) enum LayerAccessStatsReset {
    NoReset,
-    JustTaskKindFlags,
    AllStats,
 }

-fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
-    ts.duration_since(UNIX_EPOCH)
-        .expect("better to die in this unlikely case than report false stats")
-        .as_millis()
-        .try_into()
-        .expect("64 bits is enough for few more years")
-}
+impl Default for LayerAccessStats {
+    fn default() -> Self {
+        // Default value is to assume resident since creation time, and visible.
+        let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now());
+        value |= 0x1 << Self::VISIBILITY_SHIFT;

-impl LayerAccessStatFullDetails {
-    fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
-        let Self {
-            when,
-            task_kind,
-            access_kind,
-        } = self;
-        pageserver_api::models::LayerAccessStatFullDetails {
-            when_millis_since_epoch: system_time_to_millis_since_epoch(when),
-            task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros
-            access_kind: *access_kind,
-        }
+        Self(std::sync::atomic::AtomicU64::new(value))
    }
 }

+// Efficient store of two very-low-resolution timestamps and some bits.  Used for storing last access time and
+// last residence change time.
 impl LayerAccessStats {
-    /// Create an empty stats object.
-    ///
-    /// The caller is responsible for recording a residence event
-    /// using [`record_residence_event`] before calling `latest_activity`.
-    /// If they don't, [`latest_activity`] will return `None`.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    /// [`latest_activity`]: Self::latest_activity
-    pub(crate) fn empty_will_record_residence_event_later() -> Self {
-        LayerAccessStats(Mutex::default())
+    // How many high bits to drop from a u32 timestamp?
+    // - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use
+    //   after that, this software has been very successful!)
+    // - Dropping the top bit is implicitly safe because unix timestamps are meant to be
+    // stored in an i32, so they never used it.
+    // - Dropping the next two bits is safe because this code is only running on systems in
+    // years >= 2024, and these bits have been 1 since 2021
+    //
+    // Therefore we may store only 28 bits for a timestamp with one second resolution.  We do
+    // this truncation to make space for some flags in the high bits of our u64.
+    const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1;
+    const TS_MASK: u32 = 0x1f_ff_ff_ff;
+    const TS_ONES: u32 = 0x60_00_00_00;
+
+    const ATIME_SHIFT: u32 = 0;
+    const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS;
+    const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS;
+
+    fn write_bits(&self, mask: u64, value: u64) -> u64 {
+        self.0
+            .fetch_update(
+                // TODO: decide what orderings are correct
+                std::sync::atomic::Ordering::Relaxed,
+                std::sync::atomic::Ordering::Relaxed,
+                |v| Some((v & !mask) | (value & mask)),
+            )
+            .expect("Inner function is infallible")
    }

-    /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
-    ///
-    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
-    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
-        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
-        new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
-        new
+    fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) {
+        // Drop the low three bits of the timestamp, for an ~8s accuracy
+        let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64);
+
+        ((Self::TS_MASK as u64) << shift, timestamp << shift)
+    }
+
+    fn read_low_res_timestamp(&self, shift: u32) -> Option<SystemTime> {
+        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
+
+        let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift;
+        if ts_bits == 0 {
+            None
+        } else {
+            Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64)))
+        }
    }

    /// Record a change in layer residency.
@@ -563,129 +548,112 @@ impl LayerAccessStats {
    /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
    /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
    /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
-    ///
-    pub(crate) fn record_residence_event(
-        &self,
-        status: LayerResidenceStatus,
-        reason: LayerResidenceEventReason,
-    ) {
-        let mut locked = self.0.lock().unwrap();
-        locked.iter_mut().for_each(|inner| {
-            inner
-                .last_residence_changes
-                .write(LayerResidenceEvent::new(status, reason))
-        });
+    pub(crate) fn record_residence_event_at(&self, now: SystemTime) {
+        let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now);
+        self.write_bits(mask, value);
    }

-    fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
+    pub(crate) fn record_residence_event(&self) {
+        self.record_residence_event_at(SystemTime::now())
+    }
+
+    pub(crate) fn record_access_at(&self, now: SystemTime) {
+        let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
+
+        // A layer which is accessed must be visible.
+        mask |= 0x1 << Self::VISIBILITY_SHIFT;
+        value |= 0x1 << Self::VISIBILITY_SHIFT;
+
+        self.write_bits(mask, value);
+    }
+
+    pub(crate) fn record_access(&self, ctx: &RequestContext) {
        if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
            return;
        }

-        let this_access = LayerAccessStatFullDetails {
-            when: SystemTime::now(),
-            task_kind: ctx.task_kind(),
-            access_kind,
-        };
-
-        let mut locked = self.0.lock().unwrap();
-        locked.iter_mut().for_each(|inner| {
-            inner.first_access.get_or_insert(this_access);
-            inner.count_by_access_kind[access_kind] += 1;
-            inner.task_kind_flag |= ctx.task_kind();
-            inner.last_accesses.write(this_access);
-        })
+        self.record_access_at(SystemTime::now())
    }

    fn as_api_model(
        &self,
        reset: LayerAccessStatsReset,
    ) -> pageserver_api::models::LayerAccessStats {
-        let mut locked = self.0.lock().unwrap();
-        let inner = &mut locked.for_scraping_api;
-        let LayerAccessStatsInner {
-            first_access,
-            count_by_access_kind,
-            task_kind_flag,
-            last_accesses,
-            last_residence_changes,
-        } = inner;
        let ret = pageserver_api::models::LayerAccessStats {
-            access_count_by_access_kind: count_by_access_kind
-                .iter()
-                .map(|(kind, count)| (kind, *count))
-                .collect(),
-            task_kind_access_flag: task_kind_flag
-                .iter()
-                .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros
-                .collect(),
-            first: first_access.as_ref().map(|a| a.as_api_model()),
-            accesses_history: last_accesses.map(|m| m.as_api_model()),
-            residence_events_history: last_residence_changes.clone(),
+            access_time: self
+                .read_low_res_timestamp(Self::ATIME_SHIFT)
+                .unwrap_or(UNIX_EPOCH),
+            residence_time: self
+                .read_low_res_timestamp(Self::RTIME_SHIFT)
+                .unwrap_or(UNIX_EPOCH),
+            visible: matches!(self.visibility(), LayerVisibilityHint::Visible),
        };
        match reset {
-            LayerAccessStatsReset::NoReset => (),
-            LayerAccessStatsReset::JustTaskKindFlags => {
-                inner.task_kind_flag.clear();
-            }
+            LayerAccessStatsReset::NoReset => {}
            LayerAccessStatsReset::AllStats => {
-                *inner = LayerAccessStatsInner::default();
+                self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0);
+                self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0);
            }
        }
        ret
    }

-    /// Get the latest access timestamp, falling back to latest residence event, further falling
-    /// back to `SystemTime::now` for a usable timestamp for eviction.
-    pub(crate) fn latest_activity_or_now(&self) -> SystemTime {
-        self.latest_activity().unwrap_or_else(SystemTime::now)
+    /// Get the latest access timestamp, falling back to latest residence event.  The latest residence event
+    /// will be this Layer's construction time, if its residence hasn't changed since then.
+    pub(crate) fn latest_activity(&self) -> SystemTime {
+        if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) {
+            t
+        } else {
+            self.read_low_res_timestamp(Self::RTIME_SHIFT)
+                .expect("Residence time is set on construction")
+        }
    }

-    /// Get the latest access timestamp, falling back to latest residence event.
+    /// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]).
    ///
-    /// This function can only return `None` if there has not yet been a call to the
-    /// [`record_residence_event`] method. That would generally be considered an
-    /// implementation error. This function logs a rate-limited warning in that case.
-    ///
-    /// TODO: use type system to avoid the need for `fallback`.
-    /// The approach in <https://github.com/neondatabase/neon/pull/3775>
-    /// could be used to enforce that a residence event is recorded
-    /// before a layer is added to the layer map. We could also have
-    /// a layer wrapper type that holds the LayerAccessStats, and ensure
-    /// that that type can only be produced by inserting into the layer map.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    fn latest_activity(&self) -> Option<SystemTime> {
-        let locked = self.0.lock().unwrap();
-        let inner = &locked.for_eviction_policy;
-        match inner.last_accesses.recent() {
-            Some(a) => Some(a.when),
-            None => match inner.last_residence_changes.recent() {
-                Some(e) => Some(e.timestamp),
-                None => {
-                    static WARN_RATE_LIMIT: Lazy<Mutex<(usize, RateLimit)>> =
-                        Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10)))));
-                    let mut guard = WARN_RATE_LIMIT.lock().unwrap();
-                    guard.0 += 1;
-                    let occurences = guard.0;
-                    guard.1.call(move || {
-                        warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value");
-                    });
-                    None
-                }
-            },
+    /// This indicates whether the layer has been used for some purpose that would motivate
+    /// us to keep it on disk, such as for serving a getpage request.
+    fn accessed(&self) -> bool {
+        // Consider it accessed if the most recent access is more recent than
+        // the most recent change in residence status.
+        match (
+            self.read_low_res_timestamp(Self::ATIME_SHIFT),
+            self.read_low_res_timestamp(Self::RTIME_SHIFT),
+        ) {
+            (None, _) => false,
+            (Some(_), None) => true,
+            (Some(a), Some(r)) => a >= r,
+        }
+    }
+
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
+        let value = match visibility {
+            LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
+            LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
+        };
+
+        self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
+    }
+
+    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
+        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
+        match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
+            1 => LayerVisibilityHint::Visible,
+            0 => LayerVisibilityHint::Covered,
+            _ => unreachable!(),
        }
    }
 }

 /// Get a layer descriptor from a layer.
-pub trait AsLayerDesc {
+pub(crate) trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
 }

 pub mod tests {
    use pageserver_api::shard::TenantShardId;
+    use utils::id::TimelineId;

    use super::*;

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -33,11 +33,14 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
-use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
+use crate::tenant::disk_btree::{
+    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
+};
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
+    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
+    VectoredReadPlanner,
 };
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
@@ -49,10 +52,11 @@ use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
-use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
+use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
+use std::collections::VecDeque;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
@@ -261,7 +265,7 @@ impl DeltaLayer {
            return Ok(());
        }

-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+        let inner = self.load(ctx).await?;

        inner.dump(ctx).await
    }
@@ -294,12 +298,8 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    async fn load(
-        &self,
-        access_kind: LayerAccessKind,
-        ctx: &RequestContext,
-    ) -> Result<&Arc<DeltaLayerInner>> {
-        self.access_stats.record_access(access_kind, ctx);
+    async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
+        self.access_stats.record_access(ctx);
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
@@ -352,7 +352,7 @@ impl DeltaLayer {
                summary.lsn_range,
                metadata.len(),
            ),
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
+            access_stats: Default::default(),
            inner: OnceCell::new(),
        })
    }
@@ -456,7 +456,12 @@ impl DeltaLayerWriterInner {
        will_init: bool,
        ctx: &RequestContext,
    ) -> (Vec<u8>, anyhow::Result<()>) {
-        assert!(self.lsn_range.start <= lsn);
+        assert!(
+            self.lsn_range.start <= lsn,
+            "lsn_start={}, lsn={}",
+            self.lsn_range.start,
+            lsn
+        );
        // We don't want to use compression in delta layer creation
        let compression = ImageCompressionAlgorithm::Disabled;
        let (val, res) = self
@@ -747,12 +752,10 @@ impl DeltaLayer {
 }

 impl DeltaLayerInner {
-    #[cfg(test)]
    pub(crate) fn key_range(&self) -> &Range<Key> {
        &self.layer_key_range
    }

-    #[cfg(test)]
    pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
        &self.layer_lsn_range
    }
@@ -1180,9 +1183,7 @@ impl DeltaLayerInner {
                    let delta_key = DeltaKey::from_slice(key);
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
-                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
-                            Adapter(self),
-                        )),
+                        layer: self,
                    };
                    let pos = BlobRef(value).pos();
                    if let Some(last) = all_keys.last_mut() {
@@ -1321,7 +1322,7 @@ impl DeltaLayerInner {
                        offsets.start.pos(),
                        offsets.end.pos(),
                        meta,
-                        Some(max_read_size),
+                        max_read_size,
                    ))
                }
            } else {
@@ -1426,7 +1427,7 @@ impl DeltaLayerInner {
        let keys = self.load_keys(ctx).await?;

        async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
-            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
+            let buf = val.load_raw(ctx).await?;
            let val = Value::des(&buf)?;
            let desc = match val {
                Value::Image(img) => {
@@ -1461,8 +1462,7 @@ impl DeltaLayerInner {
            use pageserver_api::key::CHECKPOINT_KEY;
            use postgres_ffi::CheckPoint;
            if key == CHECKPOINT_KEY {
-                let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
-                let val = Value::des(&buf)?;
+                let val = val.load(ctx).await?;
                match val {
                    Value::Image(img) => {
                        let checkpoint = CheckPoint::decode(&img)?;
@@ -1515,7 +1515,6 @@ impl DeltaLayerInner {
        offset
    }

-    #[cfg(test)]
    pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader =
@@ -1526,7 +1525,7 @@ impl DeltaLayerInner {
            index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx),
            key_values_batch: std::collections::VecDeque::new(),
            is_end: false,
-            planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
+            planner: StreamingVectoredReadPlanner::new(
                1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
                1024,        // The default value. Unit tests might use a different value
            ),
@@ -1547,17 +1546,24 @@ pub struct DeltaEntry<'a> {
 /// Reference to an on-disk value
 pub struct ValueRef<'a> {
    blob_ref: BlobRef,
-    reader: BlockCursor<'a>,
+    layer: &'a DeltaLayerInner,
 }

 impl<'a> ValueRef<'a> {
    /// Loads the value from disk
    pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
-        // theoretically we *could* record an access time for each, but it does not really matter
-        let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
+        let buf = self.load_raw(ctx).await?;
        let val = Value::des(&buf)?;
        Ok(val)
    }
+
+    async fn load_raw(&self, ctx: &RequestContext) -> Result<Vec<u8>> {
+        let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter(
+            self.layer,
+        )));
+        let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?;
+        Ok(buf)
+    }
 }

 pub(crate) struct Adapter<T>(T);
@@ -1591,17 +1597,15 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
    }
 }

-#[cfg(test)]
 pub struct DeltaLayerIterator<'a> {
    delta_layer: &'a DeltaLayerInner,
    ctx: &'a RequestContext,
-    planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
-    index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
-    key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
+    planner: StreamingVectoredReadPlanner,
+    index_iter: DiskBtreeIterator<'a>,
+    key_values_batch: VecDeque<(Key, Lsn, Value)>,
    is_end: bool,
 }

-#[cfg(test)]
 impl<'a> DeltaLayerIterator<'a> {
    /// Retrieve a batch of key-value pairs into the iterator buffer.
    async fn next_batch(&mut self) -> anyhow::Result<()> {
@@ -1615,13 +1619,17 @@ impl<'a> DeltaLayerIterator<'a> {
                let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
                let blob_ref = BlobRef(value);
                let offset = blob_ref.pos();
-                if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) {
+                if let Some(batch_plan) = self.planner.handle(key, lsn, offset) {
                    break batch_plan;
                }
            } else {
                self.is_end = true;
                let data_end_offset = self.delta_layer.index_start_offset();
-                break self.planner.handle_range_end(data_end_offset);
+                if let Some(item) = self.planner.handle_range_end(data_end_offset) {
+                    break item;
+                } else {
+                    return Ok(()); // TODO: test empty iterator
+                }
            }
        };
        let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
@@ -1664,6 +1672,7 @@ pub(crate) mod test {
    use rand::RngCore;

    use super::*;
+    use crate::repository::Value;
    use crate::tenant::harness::TIMELINE_ID;
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
    use crate::tenant::Tenant;
@@ -1673,6 +1682,7 @@ pub(crate) mod test {
        tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
        DEFAULT_PG_VERSION,
    };
+    use bytes::Bytes;

    /// Construct an index for a fictional delta layer and and then
    /// traverse in order to plan vectored reads for a query. Finally,
@@ -1925,7 +1935,7 @@ pub(crate) mod test {

    #[tokio::test]
    async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
+        let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?;
        let (tenant, ctx) = harness.load().await;

        let timeline_id = TimelineId::generate();
@@ -2025,7 +2035,9 @@ pub(crate) mod test {
        use crate::walrecord::NeonWalRecord;
        use bytes::Bytes;

-        let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
+        let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
+            .await
+            .unwrap();
        let (tenant, ctx) = h.load().await;
        let ctx = &ctx;
        let timeline = tenant
@@ -2241,6 +2253,15 @@ pub(crate) mod test {
        (k1, l1).cmp(&(k2, l2))
    }

+    pub(crate) fn sort_delta_value(
+        (k1, l1, v1): &(Key, Lsn, Value),
+        (k2, l2, v2): &(Key, Lsn, Value),
+    ) -> std::cmp::Ordering {
+        let order_1 = if v1.is_image() { 0 } else { 1 };
+        let order_2 = if v2.is_image() { 0 } else { 1 };
+        (k1, l1, order_1).cmp(&(k2, l2, order_2))
+    }
+
    pub(crate) async fn produce_delta_layer(
        tenant: &Tenant,
        tline: &Arc<Timeline>,
@@ -2249,7 +2270,7 @@ pub(crate) mod test {
    ) -> anyhow::Result<ResidentLayer> {
        deltas.sort_by(sort_delta);
        let (key_start, _, _) = deltas.first().unwrap();
-        let (key_max, _, _) = deltas.first().unwrap();
+        let (key_max, _, _) = deltas.last().unwrap();
        let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
        let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
        let lsn_end = Lsn(lsn_max.0 + 1);
@@ -2294,10 +2315,7 @@ pub(crate) mod test {

    #[tokio::test]
    async fn delta_layer_iterator() {
-        use crate::repository::Value;
-        use bytes::Bytes;
-
-        let harness = TenantHarness::create("delta_layer_iterator").unwrap();
+        let harness = TenantHarness::create("delta_layer_iterator").await.unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -29,13 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
-use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
+use crate::tenant::disk_btree::{
+    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
+};
 use crate::tenant::storage_layer::{
    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
+    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
+    VectoredReadPlanner,
 };
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
@@ -46,10 +49,10 @@ use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
-use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
+use std::collections::VecDeque;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
@@ -224,7 +227,7 @@ impl ImageLayer {
            return Ok(());
        }

-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+        let inner = self.load(ctx).await?;

        inner.dump(ctx).await?;

@@ -251,12 +254,8 @@ impl ImageLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    async fn load(
-        &self,
-        access_kind: LayerAccessKind,
-        ctx: &RequestContext,
-    ) -> Result<&ImageLayerInner> {
-        self.access_stats.record_access(access_kind, ctx);
+    async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
+        self.access_stats.record_access(ctx);
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
            .await
@@ -308,7 +307,7 @@ impl ImageLayer {
                metadata.len(),
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
+            access_stats: Default::default(),
            inner: OnceCell::new(),
        })
    }
@@ -369,12 +368,10 @@ impl ImageLayer {
 }

 impl ImageLayerInner {
-    #[cfg(test)]
    pub(crate) fn key_range(&self) -> &Range<Key> {
        &self.key_range
    }

-    #[cfg(test)]
    pub(crate) fn lsn(&self) -> Lsn {
        self.lsn
    }
@@ -699,7 +696,6 @@ impl ImageLayerInner {
        }
    }

-    #[cfg(test)]
    pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader =
@@ -708,9 +704,9 @@ impl ImageLayerInner {
            image_layer: self,
            ctx,
            index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx),
-            key_values_batch: std::collections::VecDeque::new(),
+            key_values_batch: VecDeque::new(),
            is_end: false,
-            planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
+            planner: StreamingVectoredReadPlanner::new(
                1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
                1024,        // The default value. Unit tests might use a different value
            ),
@@ -737,6 +733,9 @@ struct ImageLayerWriterInner {
    key_range: Range<Key>,
    lsn: Lsn,

+    // Total uncompressed bytes passed into put_image
+    uncompressed_bytes: u64,
+
    blob_writer: BlobWriter<false>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
 }
@@ -792,6 +791,7 @@ impl ImageLayerWriterInner {
            lsn,
            tree: tree_builder,
            blob_writer,
+            uncompressed_bytes: 0,
        };

        Ok(writer)
@@ -809,7 +809,12 @@ impl ImageLayerWriterInner {
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
-        let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
+        let compression = self.conf.image_compression;
+        self.uncompressed_bytes += img.len() as u64;
+        let (_img, res) = self
+            .blob_writer
+            .write_blob_maybe_compressed(img, ctx, compression)
+            .await;
        // TODO: re-use the buffer for `img` further upstack
        let off = res?;

@@ -831,6 +836,11 @@ impl ImageLayerWriterInner {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

+        // Calculate compression ratio
+        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
+        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
+        crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
+
        let mut file = self.blob_writer.into_inner();

        // Write out the index
@@ -970,17 +980,15 @@ impl Drop for ImageLayerWriter {
    }
 }

-#[cfg(test)]
 pub struct ImageLayerIterator<'a> {
    image_layer: &'a ImageLayerInner,
    ctx: &'a RequestContext,
-    planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
-    index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
-    key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
+    planner: StreamingVectoredReadPlanner,
+    index_iter: DiskBtreeIterator<'a>,
+    key_values_batch: VecDeque<(Key, Lsn, Value)>,
    is_end: bool,
 }

-#[cfg(test)]
 impl<'a> ImageLayerIterator<'a> {
    /// Retrieve a batch of key-value pairs into the iterator buffer.
    async fn next_batch(&mut self) -> anyhow::Result<()> {
@@ -994,14 +1002,17 @@ impl<'a> ImageLayerIterator<'a> {
                    Key::from_slice(&raw_key[..KEY_SIZE]),
                    self.image_layer.lsn,
                    offset,
-                    BlobFlag::None,
                ) {
                    break batch_plan;
                }
            } else {
                self.is_end = true;
                let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64;
-                break self.planner.handle_range_end(payload_end);
+                if let Some(item) = self.planner.handle_range_end(payload_end) {
+                    break item;
+                } else {
+                    return Ok(()); // TODO: a test case on empty iterator
+                }
            }
        };
        let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
@@ -1095,6 +1106,7 @@ mod test {
            ShardIdentity::unsharded(),
            get_next_gen(),
        )
+        .await
        .unwrap();
        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
@@ -1161,6 +1173,7 @@ mod test {
                // But here, all we care about is that the gen number is unique.
                get_next_gen(),
            )
+            .await
            .unwrap();
            let (tenant, ctx) = harness.load().await;
            let timeline = tenant
@@ -1292,7 +1305,7 @@ mod test {

    #[tokio::test]
    async fn image_layer_iterator() {
-        let harness = TenantHarness::create("image_layer_iterator").unwrap();
+        let harness = TenantHarness::create("image_layer_iterator").await.unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -18,7 +18,7 @@ use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
-use std::collections::{BTreeMap, BinaryHeap, HashSet};
+use std::collections::BTreeMap;
 use std::sync::{Arc, OnceLock};
 use std::time::Instant;
 use tracing::*;
@@ -375,15 +375,6 @@ impl InMemoryLayer {
        let inner = self.inner.read().await;
        let reader = inner.file.block_cursor();

-        #[derive(Eq, PartialEq, Ord, PartialOrd)]
-        struct BlockRead {
-            key: Key,
-            lsn: Lsn,
-            block_offset: u64,
-        }
-
-        let mut planned_block_reads = BinaryHeap::new();
-
        for range in keyspace.ranges.iter() {
            for (key, vec_map) in inner.index.range(range.start..range.end) {
                let lsn_range = match reconstruct_state.get_cached_lsn(key) {
@@ -392,49 +383,32 @@ impl InMemoryLayer {
                };

                let slice = vec_map.slice_range(lsn_range);
+
                for (entry_lsn, pos) in slice.iter().rev() {
-                    planned_block_reads.push(BlockRead {
-                        key: *key,
-                        lsn: *entry_lsn,
-                        block_offset: *pos,
-                    });
+                    // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
+                    let buf = reader.read_blob(*pos, &ctx).await;
+                    if let Err(e) = buf {
+                        reconstruct_state
+                            .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
+                        break;
+                    }
+
+                    let value = Value::des(&buf.unwrap());
+                    if let Err(e) = value {
+                        reconstruct_state
+                            .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
+                        break;
+                    }
+
+                    let key_situation =
+                        reconstruct_state.update_key(key, *entry_lsn, value.unwrap());
+                    if key_situation == ValueReconstructSituation::Complete {
+                        break;
+                    }
                }
            }
        }

-        let keyspace_size = keyspace.total_raw_size();
-
-        let mut completed_keys = HashSet::new();
-        while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
-            let block_read = planned_block_reads.pop().unwrap();
-            if completed_keys.contains(&block_read.key) {
-                continue;
-            }
-
-            // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
-            let buf = reader.read_blob(block_read.block_offset, &ctx).await;
-            if let Err(e) = buf {
-                reconstruct_state
-                    .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
-                completed_keys.insert(block_read.key);
-                continue;
-            }
-
-            let value = Value::des(&buf.unwrap());
-            if let Err(e) = value {
-                reconstruct_state
-                    .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
-                completed_keys.insert(block_read.key);
-                continue;
-            }
-
-            let key_situation =
-                reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap());
-            if key_situation == ValueReconstructSituation::Complete {
-                completed_keys.insert(block_read.key);
-            }
-        }
-
        reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);

        Ok(())
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1,9 +1,7 @@
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::keyspace::KeySpace;
-use pageserver_api::models::{
-    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
-};
+use pageserver_api::models::HistoricLayerInfo;
 use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
@@ -160,13 +158,10 @@ impl Layer {
            metadata.file_size,
        );

-        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
-
        let owner = Layer(Arc::new(LayerInner::new(
            conf,
            timeline,
            local_path,
-            access_stats,
            desc,
            None,
            metadata.generation,
@@ -193,8 +188,6 @@ impl Layer {
            metadata.file_size,
        );

-        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
-
        let mut resident = None;

        let owner = Layer(Arc::new_cyclic(|owner| {
@@ -209,7 +202,6 @@ impl Layer {
                conf,
                timeline,
                local_path,
-                access_stats,
                desc,
                Some(inner),
                metadata.generation,
@@ -245,11 +237,6 @@ impl Layer {
                version: 0,
            });
            resident = Some(inner.clone());
-            let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
-            access_stats.record_residence_event(
-                LayerResidenceStatus::Resident,
-                LayerResidenceEventReason::LayerCreate,
-            );

            let local_path = local_layer_path(
                conf,
@@ -259,16 +246,22 @@ impl Layer {
                &timeline.generation,
            );

-            LayerInner::new(
+            let layer = LayerInner::new(
                conf,
                timeline,
                local_path,
-                access_stats,
                desc,
                Some(inner),
                timeline.generation,
                timeline.get_shard_index(),
-            )
+            );
+
+            // Newly created layers are marked visible by default: the usual case is that they were created to be read.
+            layer
+                .access_stats
+                .set_visibility(super::LayerVisibilityHint::Visible);
+
+            layer
        }));

        let downloaded = resident.expect("just initialized");
@@ -332,9 +325,7 @@ impl Layer {
        use anyhow::ensure;

        let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
-        self.0
-            .access_stats
-            .record_access(LayerAccessKind::GetValueReconstructData, ctx);
+        self.0.access_stats.record_access(ctx);

        if self.layer_desc().is_delta {
            ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
@@ -368,9 +359,7 @@ impl Layer {
                other => GetVectoredError::Other(anyhow::anyhow!(other)),
            })?;

-        self.0
-            .access_stats
-            .record_access(LayerAccessKind::GetValueReconstructData, ctx);
+        self.0.access_stats.record_access(ctx);

        layer
            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -385,6 +374,7 @@ impl Layer {
    }

    /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
+    #[allow(dead_code)]
    pub(crate) async fn load_key_values(
        &self,
        ctx: &RequestContext,
@@ -693,6 +683,18 @@ impl Drop for LayerInner {
            // and we could be delaying shutdown for nothing.
        }

+        if let Some(timeline) = self.timeline.upgrade() {
+            // Only need to decrement metrics if the timeline still exists: otherwise
+            // it will have already de-registered these metrics via TimelineMetrics::shutdown
+            if self.desc.is_delta() {
+                timeline.metrics.layer_count_delta.dec();
+                timeline.metrics.layer_size_delta.sub(self.desc.file_size);
+            } else {
+                timeline.metrics.layer_count_image.dec();
+                timeline.metrics.layer_size_image.sub(self.desc.file_size);
+            }
+        }
+
        if !*self.wanted_deleted.get_mut() {
            return;
        }
@@ -773,7 +775,6 @@ impl LayerInner {
        conf: &'static PageServerConf,
        timeline: &Arc<Timeline>,
        local_path: Utf8PathBuf,
-        access_stats: LayerAccessStats,
        desc: PersistentLayerDesc,
        downloaded: Option<Arc<DownloadedLayer>>,
        generation: Generation,
@@ -791,6 +792,15 @@ impl LayerInner {
            (heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
        };

+        // This object acts as a RAII guard on these metrics: increment on construction
+        if desc.is_delta() {
+            timeline.metrics.layer_count_delta.inc();
+            timeline.metrics.layer_size_delta.add(desc.file_size);
+        } else {
+            timeline.metrics.layer_count_image.inc();
+            timeline.metrics.layer_size_image.add(desc.file_size);
+        }
+
        LayerInner {
            conf,
            debug_str: {
@@ -799,7 +809,7 @@ impl LayerInner {
            path: local_path,
            desc,
            timeline: Arc::downgrade(timeline),
-            access_stats,
+            access_stats: Default::default(),
            wanted_deleted: AtomicBool::new(false),
            inner,
            version: AtomicUsize::new(version),
@@ -1154,10 +1164,7 @@ impl LayerInner {
                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
                }

-                self.access_stats.record_residence_event(
-                    LayerResidenceStatus::Resident,
-                    LayerResidenceEventReason::ResidenceChange,
-                );
+                self.access_stats.record_residence_event();

                Ok(self.initialize_after_layer_is_on_disk(permit))
            }
@@ -1276,7 +1283,7 @@ impl LayerInner {
                lsn_end: lsn_range.end,
                remote: !resident,
                access_stats,
-                l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
+                l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
            }
        } else {
            let lsn = self.desc.image_layer_lsn();
@@ -1469,14 +1476,22 @@ impl LayerInner {
                let duration = SystemTime::now().duration_since(local_layer_mtime);
                match duration {
                    Ok(elapsed) => {
-                        timeline
-                            .metrics
-                            .evictions_with_low_residence_duration
-                            .read()
-                            .unwrap()
-                            .observe(elapsed);
+                        let accessed = self.access_stats.accessed();
+                        if accessed {
+                            // Only layers used for reads contribute to our "low residence" metric that is used
+                            // to detect thrashing.  Layers promoted for other reasons (e.g. compaction) are allowed
+                            // to be rapidly evicted without contributing to this metric.
+                            timeline
+                                .metrics
+                                .evictions_with_low_residence_duration
+                                .read()
+                                .unwrap()
+                                .observe(elapsed);
+                        }
+
                        tracing::info!(
                            residence_millis = elapsed.as_millis(),
+                            accessed,
                            "evicted layer after known residence period"
                        );
                    }
@@ -1503,10 +1518,7 @@ impl LayerInner {
            }
        }

-        self.access_stats.record_residence_event(
-            LayerResidenceStatus::Evicted,
-            LayerResidenceEventReason::ResidenceChange,
-        );
+        self.access_stats.record_residence_event();

        self.status.as_ref().unwrap().send_replace(Status::Evicted);

@@ -1832,9 +1844,7 @@ impl ResidentLayer {
                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
                // while it's being held.
-                owner
-                    .access_stats
-                    .record_access(LayerAccessKind::KeyIter, ctx);
+                owner.access_stats.record_access(ctx);

                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
@@ -1889,7 +1899,7 @@ impl ResidentLayer {
        self.owner.metadata()
    }

-    #[cfg(test)]
+    /// Cast the layer to a delta, return an error if it is an image layer.
    pub(crate) async fn get_as_delta(
        &self,
        ctx: &RequestContext,
@@ -1901,7 +1911,7 @@ impl ResidentLayer {
        }
    }

-    #[cfg(test)]
+    /// Cast the layer to an image, return an error if it is a delta layer.
    pub(crate) async fn get_as_image(
        &self,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1,3 +1,5 @@
+use std::time::UNIX_EPOCH;
+
 use pageserver_api::key::CONTROLFILE_KEY;
 use tokio::task::JoinSet;
 use utils::{
@@ -7,7 +9,7 @@ use utils::{

 use super::failpoints::{Failpoint, FailpointKind};
 use super::*;
-use crate::context::DownloadBehavior;
+use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint};
 use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};

 /// Used in tests to advance a future to wanted await point, and not futher.
@@ -22,7 +24,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
 async fn smoke_test() {
    let handle = tokio::runtime::Handle::current();

-    let h = TenantHarness::create("smoke_test").unwrap();
+    let h = TenantHarness::create("smoke_test").await.unwrap();
    let span = h.span();
    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
    let (tenant, _) = h.load().await;
@@ -176,7 +178,9 @@ async fn evict_and_wait_on_wanted_deleted() {
    // this is the runtime on which Layer spawns the blocking tasks on
    let handle = tokio::runtime::Handle::current();

-    let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
+    let h = TenantHarness::create("evict_and_wait_on_wanted_deleted")
+        .await
+        .unwrap();
    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
    let (tenant, ctx) = h.load().await;

@@ -258,7 +262,9 @@ fn read_wins_pending_eviction() {
    rt.block_on(async move {
        // this is the runtime on which Layer spawns the blocking tasks on
        let handle = tokio::runtime::Handle::current();
-        let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
+        let h = TenantHarness::create("read_wins_pending_eviction")
+            .await
+            .unwrap();
        let (tenant, ctx) = h.load().await;
        let span = h.span();
        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -390,7 +396,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
    rt.block_on(async move {
        // this is the runtime on which Layer spawns the blocking tasks on
        let handle = tokio::runtime::Handle::current();
-        let h = TenantHarness::create(name).unwrap();
+        let h = TenantHarness::create(name).await.unwrap();
        let (tenant, ctx) = h.load().await;
        let span = h.span();
        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -559,8 +565,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
 #[tokio::test(start_paused = true)]
 async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    let handle = tokio::runtime::Handle::current();
-    let h =
-        TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
+    let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction")
+        .await
+        .unwrap();
    let (tenant, ctx) = h.load().await;

    let timeline = tenant
@@ -636,7 +643,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
 #[tokio::test(start_paused = true)]
 async fn evict_and_wait_does_not_wait_for_download() {
    // let handle = tokio::runtime::Handle::current();
-    let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
+    let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download")
+        .await
+        .unwrap();
    let (tenant, ctx) = h.load().await;
    let span = h.span();
    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -733,7 +742,9 @@ async fn eviction_cancellation_on_drop() {
    // this is the runtime on which Layer spawns the blocking tasks on
    let handle = tokio::runtime::Handle::current();

-    let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
+    let h = TenantHarness::create("eviction_cancellation_on_drop")
+        .await
+        .unwrap();
    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
    let (tenant, ctx) = h.load().await;

@@ -817,9 +828,9 @@ async fn eviction_cancellation_on_drop() {
 #[test]
 #[cfg(target_arch = "x86_64")]
 fn layer_size() {
-    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
+    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 8);
    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(std::mem::size_of::<LayerInner>(), 2344);
+    assert_eq!(std::mem::size_of::<LayerInner>(), 312);
    // it also has the utf8 path
 }

@@ -959,3 +970,46 @@ fn spawn_blocking_pool_helper_actually_works() {
        println!("joined");
    });
 }
+
+/// Drop the low bits from a time, to emulate the precision loss in LayerAccessStats
+fn lowres_time(hires: SystemTime) -> SystemTime {
+    let ts = hires.duration_since(UNIX_EPOCH).unwrap().as_secs();
+    UNIX_EPOCH + Duration::from_secs(ts)
+}
+
+#[test]
+fn access_stats() {
+    let access_stats = LayerAccessStats::default();
+    // Default is visible
+    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible);
+
+    access_stats.set_visibility(LayerVisibilityHint::Covered);
+    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
+    access_stats.set_visibility(LayerVisibilityHint::Visible);
+    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible);
+
+    let rtime = UNIX_EPOCH + Duration::from_secs(2000000000);
+    access_stats.record_residence_event_at(rtime);
+    assert_eq!(access_stats.latest_activity(), lowres_time(rtime));
+
+    let atime = UNIX_EPOCH + Duration::from_secs(2100000000);
+    access_stats.record_access_at(atime);
+    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
+
+    // Setting visibility doesn't clobber access time
+    access_stats.set_visibility(LayerVisibilityHint::Covered);
+    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
+    access_stats.set_visibility(LayerVisibilityHint::Visible);
+    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
+}
+
+#[test]
+fn access_stats_2038() {
+    // The access stats structure uses a timestamp representation that will run out
+    // of bits in 2038.  One year before that, this unit test will start failing.
+
+    let one_year_from_now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap()
+        + Duration::from_secs(3600 * 24 * 365);
+
+    assert!(one_year_from_now.as_secs() < (2 << 31));
+}
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -25,7 +25,7 @@ pub struct PersistentLayerDesc {
    ///
    /// - For an open in-memory layer, the end bound is MAX_LSN
    /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
-    /// range start
+    ///   range start
    /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
    pub lsn_range: Range<Lsn>,
    /// Whether this is a delta layer, and also, is this incremental.
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -248,6 +248,14 @@ impl LayerName {
            Image(_) => "image",
        }
    }
+
+    /// Gets the key range encoded in the layer name.
+    pub fn key_range(&self) -> &Range<Key> {
+        match &self {
+            LayerName::Image(layer) => &layer.key_range,
+            LayerName::Delta(layer) => &layer.key_range,
+        }
+    }
 }

 impl fmt::Display for LayerName {
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -96,15 +96,22 @@ impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
 impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        use std::cmp::Ordering;
-        let a = self.peek_next_key_lsn();
-        let b = other.peek_next_key_lsn();
+        let a = self.peek_next_key_lsn_value();
+        let b = other.peek_next_key_lsn_value();
        match (a, b) {
-            (Some((k1, l1)), Some((k2, l2))) => {
-                let loaded_1 = if self.is_loaded() { 1 } else { 0 };
-                let loaded_2 = if other.is_loaded() { 1 } else { 0 };
+            (Some((k1, l1, v1)), Some((k2, l2, v2))) => {
+                fn map_value_to_num(val: &Option<&Value>) -> usize {
+                    match val {
+                        None => 0,
+                        Some(Value::Image(_)) => 1,
+                        Some(Value::WalRecord(_)) => 2,
+                    }
+                }
+                let order_1 = map_value_to_num(&v1);
+                let order_2 = map_value_to_num(&v2);
                // When key_lsn are the same, the unloaded iter will always appear before the loaded one.
                // And note that we do a reverse at the end of the comparison, so it works with the max heap.
-                (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2))
+                (k1, l1, order_1).cmp(&(k2, l2, order_2))
            }
            (Some(_), None) => Ordering::Less,
            (None, Some(_)) => Ordering::Greater,
@@ -137,13 +144,16 @@ impl<'a> IteratorWrapper<'a> {
        }
    }

-    fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> {
+    fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
        match self {
-            Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)),
+            Self::Loaded { iter } => iter
+                .peek()
+                .as_ref()
+                .map(|(key, lsn, val)| (key, *lsn, Some(val))),
            Self::NotLoaded {
                first_key_lower_bound: (key, lsn),
                ..
-            } => Some((key, *lsn)),
+            } => Some((key, *lsn, None)),
        }
    }

@@ -191,6 +201,13 @@ impl<'a> IteratorWrapper<'a> {
    }
 }

+/// A merge iterator over delta/image layer iterators. When duplicated records are
+/// found, the iterator will not perform any deduplication, and the caller should handle
+/// these situation. By saying duplicated records, there are many possibilities:
+/// * Two same delta at the same LSN.
+/// * Two same image at the same LSN.
+/// * Delta/image at the same LSN where the image has already applied the delta.
+/// The iterator will always put the image before the delta.
 pub struct MergeIterator<'a> {
    heap: BinaryHeap<IteratorWrapper<'a>>,
 }
@@ -245,8 +262,9 @@ mod tests {
    use crate::{
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
+            storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
        },
+        walrecord::NeonWalRecord,
        DEFAULT_PG_VERSION,
    };

@@ -275,7 +293,9 @@ mod tests {
        use crate::repository::Value;
        use bytes::Bytes;

-        let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
+        let harness = TenantHarness::create("merge_iterator_merge_in_between")
+            .await
+            .unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
@@ -338,7 +358,9 @@ mod tests {
        use crate::repository::Value;
        use bytes::Bytes;

-        let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
+        let harness = TenantHarness::create("merge_iterator_delta_merge")
+            .await
+            .unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
@@ -407,6 +429,133 @@ mod tests {
        // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
    }

-    // TODO: image layer merge, delta+image mixed merge
-    // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that
+    #[tokio::test]
+    async fn delta_image_mixed_merge() {
+        use crate::repository::Value;
+        use bytes::Bytes;
+
+        let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
+            .await
+            .unwrap();
+        let (tenant, ctx) = harness.load().await;
+
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+        // In this test case, we want to test if the iterator still works correctly with multiple copies
+        // of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab.
+        // Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix.
+        // An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation
+        // could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation
+        // one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should
+        // correctly process these situations and return everything as-is, and the upper layer of the system
+        // will handle duplicated LSNs.
+        let test_deltas1 = vec![
+            (
+                get_key(0),
+                Lsn(0x10),
+                Value::WalRecord(NeonWalRecord::wal_init()),
+            ),
+            (
+                get_key(0),
+                Lsn(0x18),
+                Value::WalRecord(NeonWalRecord::wal_append("a")),
+            ),
+            (
+                get_key(5),
+                Lsn(0x10),
+                Value::WalRecord(NeonWalRecord::wal_init()),
+            ),
+            (
+                get_key(5),
+                Lsn(0x18),
+                Value::WalRecord(NeonWalRecord::wal_append("b")),
+            ),
+        ];
+        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
+            .await
+            .unwrap();
+        let mut test_deltas2 = test_deltas1.clone();
+        test_deltas2.push((
+            get_key(10),
+            Lsn(0x20),
+            Value::Image(Bytes::copy_from_slice(b"test")),
+        ));
+        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
+            .await
+            .unwrap();
+        let test_deltas3 = vec![
+            (
+                get_key(0),
+                Lsn(0x10),
+                Value::Image(Bytes::copy_from_slice(b"")),
+            ),
+            (
+                get_key(5),
+                Lsn(0x18),
+                Value::Image(Bytes::copy_from_slice(b"b")),
+            ),
+            (
+                get_key(15),
+                Lsn(0x20),
+                Value::Image(Bytes::copy_from_slice(b"test")),
+            ),
+        ];
+        let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
+            .await
+            .unwrap();
+        let mut test_deltas4 = test_deltas3.clone();
+        test_deltas4.push((
+            get_key(20),
+            Lsn(0x20),
+            Value::Image(Bytes::copy_from_slice(b"test")),
+        ));
+        let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx)
+            .await
+            .unwrap();
+        let mut expect = Vec::new();
+        expect.extend(test_deltas1);
+        expect.extend(test_deltas2);
+        expect.extend(test_deltas3);
+        expect.extend(test_deltas4);
+        expect.sort_by(sort_delta_value);
+
+        // Test with different layer order for MergeIterator::create to ensure the order
+        // is stable.
+
+        let mut merge_iter = MergeIterator::create(
+            &[
+                resident_layer_4.get_as_delta(&ctx).await.unwrap(),
+                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
+                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
+                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
+            ],
+            &[],
+            &ctx,
+        );
+        assert_merge_iter_equal(&mut merge_iter, &expect).await;
+
+        let mut merge_iter = MergeIterator::create(
+            &[
+                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
+                resident_layer_4.get_as_delta(&ctx).await.unwrap(),
+                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
+                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
+            ],
+            &[],
+            &ctx,
+        );
+        assert_merge_iter_equal(&mut merge_iter, &expect).await;
+
+        is_send(merge_iter);
+    }
+
+    fn is_send(_: impl Send) {}
 }
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -101,7 +101,6 @@ pub fn start_background_loops(
        Some(tenant_shard_id),
        None,
        &format!("compactor for tenant {tenant_shard_id}"),
-        false,
        {
            let tenant = Arc::clone(tenant);
            let background_jobs_can_start = background_jobs_can_start.cloned();
@@ -125,7 +124,6 @@ pub fn start_background_loops(
        Some(tenant_shard_id),
        None,
        &format!("garbage collector for tenant {tenant_shard_id}"),
-        false,
        {
            let tenant = Arc::clone(tenant);
            let background_jobs_can_start = background_jobs_can_start.cloned();
@@ -149,7 +147,6 @@ pub fn start_background_loops(
        Some(tenant_shard_id),
        None,
        &format!("ingest housekeeping for tenant {tenant_shard_id}"),
-        false,
        {
            let tenant = Arc::clone(tenant);
            let background_jobs_can_start = background_jobs_can_start.cloned();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,5 +1,5 @@
 pub(crate) mod analysis;
-mod compaction;
+pub(crate) mod compaction;
 pub mod delete;
 pub(crate) mod detach_ancestor;
 mod eviction_task;
@@ -69,6 +69,7 @@ use std::{
 use crate::{
    aux_file::AuxFileSizeEstimator,
    tenant::{
+        config::defaults::DEFAULT_PITR_INTERVAL,
        layer_map::{LayerMap, SearchResult},
        metadata::TimelineMetadata,
        storage_layer::PersistentLayerDesc,
@@ -197,7 +198,7 @@ impl PartialOrd for Hole {

 /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
 /// Can be removed after all refactors are done.
-fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
+fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
    drop(rlock)
 }

@@ -270,7 +271,7 @@ pub struct Timeline {
    ///
    /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
    /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
-    pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
+    pub(crate) layers: tokio::sync::RwLock<LayerManager>,

    last_freeze_at: AtomicLsn,
    // Atomic would be more appropriate here.
@@ -459,7 +460,7 @@ pub(crate) struct GcInfo {
    /// Currently, this includes all points where child branches have
    /// been forked off from. In the future, could also include
    /// explicit user-defined snapshot points.
-    pub(crate) retain_lsns: Vec<Lsn>,
+    pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>,

    /// The cutoff coordinates, which are combined by selecting the minimum.
    pub(crate) cutoffs: GcCutoffs,
@@ -475,39 +476,43 @@ impl GcInfo {
    pub(crate) fn min_cutoff(&self) -> Lsn {
        self.cutoffs.select_min()
    }
+
+    pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) {
+        self.retain_lsns.push((child_lsn, child_id));
+        self.retain_lsns.sort_by_key(|i| i.0);
+    }
+
+    pub(super) fn remove_child(&mut self, child_id: TimelineId) {
+        self.retain_lsns.retain(|i| i.1 != child_id);
+    }
 }

-/// The `GcInfo` component describing which Lsns need to be retained.
-#[derive(Debug)]
+/// The `GcInfo` component describing which Lsns need to be retained.  Functionally, this
+/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
+/// between time-based and space-based retention for observability and consumption metrics purposes.
+#[derive(Debug, Clone)]
 pub(crate) struct GcCutoffs {
-    /// Keep everything newer than this point.
-    ///
-    /// This is calculated by subtracting 'gc_horizon' setting from
-    /// last-record LSN
-    ///
-    /// FIXME: is this inclusive or exclusive?
-    pub(crate) horizon: Lsn,
+    /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
+    /// history we must keep to retain a specified number of bytes of WAL.
+    pub(crate) space: Lsn,

-    /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
-    /// point.
-    ///
-    /// This is calculated by finding a number such that a record is needed for PITR
-    /// if only if its LSN is larger than 'pitr_cutoff'.
-    pub(crate) pitr: Lsn,
+    /// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much
+    /// history we must keep to enable reading back at least the PITR interval duration.
+    pub(crate) time: Lsn,
 }

 impl Default for GcCutoffs {
    fn default() -> Self {
        Self {
-            horizon: Lsn::INVALID,
-            pitr: Lsn::INVALID,
+            space: Lsn::INVALID,
+            time: Lsn::INVALID,
        }
    }
 }

 impl GcCutoffs {
    fn select_min(&self) -> Lsn {
-        std::cmp::min(self.horizon, self.pitr)
+        std::cmp::min(self.space, self.time)
    }
 }

@@ -866,7 +871,7 @@ impl Timeline {
        let gc_info = self.gc_info.read().unwrap();
        let history = self
            .get_last_record_lsn()
-            .checked_sub(gc_info.cutoffs.pitr)
+            .checked_sub(gc_info.cutoffs.time)
            .unwrap_or(Lsn(0))
            .0;
        (history, gc_info.within_ancestor_pitr)
@@ -1565,7 +1570,7 @@ impl Timeline {
    ) -> anyhow::Result<()> {
        ensure!(
            lsn >= **latest_gc_cutoff_lsn,
-            "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
+            "LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)",
            lsn,
            **latest_gc_cutoff_lsn,
        );
@@ -2311,6 +2316,11 @@ impl Timeline {
            )
        };

+        if let Some(ancestor) = &ancestor {
+            let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
+            ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn());
+        }
+
        Arc::new_cyclic(|myself| {
            let metrics = TimelineMetrics::new(
                &tenant_shard_id,
@@ -2481,7 +2491,6 @@ impl Timeline {
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "layer flush task",
-            false,
            async move {
                let _guard = guard;
                let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
@@ -2826,7 +2835,6 @@ impl Timeline {
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "initial size calculation",
-            false,
            // NB: don't log errors here, task_mgr will do that.
            async move {
                let cancel = task_mgr::shutdown_token();
@@ -2995,7 +3003,6 @@ impl Timeline {
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "ondemand logical size calculation",
-            false,
            async move {
                let res = self_clone
                    .logical_size_calculation_task(lsn, cause, &ctx)
@@ -3162,7 +3169,7 @@ impl Timeline {
        let guard = self.layers.read().await;

        let resident = guard.likely_resident_layers().map(|layer| {
-            let last_activity_ts = layer.access_stats().latest_activity_or_now();
+            let last_activity_ts = layer.access_stats().latest_activity();

            HeatMapLayer::new(
                layer.layer_desc().layer_name(),
@@ -3408,6 +3415,8 @@ impl Timeline {
        }
    }

+    #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
+    #[allow(clippy::doc_lazy_continuation)]
    /// Get the data needed to reconstruct all keys in the provided keyspace
    ///
    /// The algorithm is as follows:
@@ -4474,10 +4483,10 @@ impl Timeline {
    /// are required. Since checking if new image layers are required is expensive in
    /// terms of CPU, we only do it in the following cases:
    /// 1. If the timeline has ingested sufficient WAL to justify the cost
-    /// 2. If enough time has passed since the last check
-    /// 2.1. For large tenants, we wish to perform the check more often since they
-    /// suffer from the lack of image layers
-    /// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval
+    /// 2. If enough time has passed since the last check:
+    ///     1. For large tenants, we wish to perform the check more often since they
+    ///        suffer from the lack of image layers
+    ///     2. For small tenants (that can mostly fit in RAM), we use a much longer interval
    fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
        const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;

@@ -4719,7 +4728,7 @@ impl Timeline {
    /// Requires a timeline that:
    /// - has an ancestor to detach from
    /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
-    /// a technical requirement
+    ///   a technical requirement
    ///
    /// After the operation has been started, it cannot be canceled. Upon restart it needs to be
    /// polled again until completion.
@@ -4731,13 +4740,7 @@ impl Timeline {
        tenant: &crate::tenant::Tenant,
        options: detach_ancestor::Options,
        ctx: &RequestContext,
-    ) -> Result<
-        (
-            completion::Completion,
-            detach_ancestor::PreparedTimelineDetach,
-        ),
-        detach_ancestor::Error,
-    > {
+    ) -> Result<detach_ancestor::Progress, detach_ancestor::Error> {
        detach_ancestor::prepare(self, tenant, options, ctx).await
    }

@@ -4764,6 +4767,18 @@ impl Timeline {
    }
 }

+impl Drop for Timeline {
+    fn drop(&mut self) {
+        if let Some(ancestor) = &self.ancestor_timeline {
+            // This lock should never be poisoned, but in case it is we do a .map() instead of
+            // an unwrap(), to avoid panicking in a destructor and thereby aborting the process.
+            if let Ok(mut gc_info) = ancestor.gc_info.write() {
+                gc_info.remove_child(self.timeline_id)
+            }
+        }
+    }
+}
+
 /// Top-level failure to compact.
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum CompactionError {
@@ -4876,7 +4891,7 @@ impl Timeline {
                // for compact_level0_phase1 creating an L0, which does not happen in practice
                // because we have not implemented L0 => L0 compaction.
                duplicated_layers.insert(l.layer_desc().key());
-            } else if LayerMap::is_l0(l.layer_desc()) {
+            } else if LayerMap::is_l0(&l.layer_desc().key_range) {
                bail!("compaction generates a L0 layer file as output, which will cause infinite compaction.");
            } else {
                insert_layers.push(l.clone());
@@ -4944,24 +4959,21 @@ impl Timeline {
    }

    /// Find the Lsns above which layer files need to be retained on
-    /// garbage collection. This is separate from actually performing the GC,
-    /// and is updated more frequently, so that compaction can remove obsolete
-    /// page versions more aggressively.
+    /// garbage collection.
    ///
-    /// TODO: that's wishful thinking, compaction doesn't actually do that
-    /// currently.
+    /// We calculate two cutoffs, one based on time and one based on WAL size.  `pitr`
+    /// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls
+    /// the space-based retention.
    ///
-    /// The 'cutoff_horizon' point is used to retain recent versions that might still be
-    /// needed by read-only nodes. (As of this writing, the caller just passes
-    /// the latest LSN subtracted by a constant, and doesn't do anything smart
-    /// to figure out what read-only nodes might actually need.)
-    ///
-    /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
-    /// whether a record is needed for PITR.
+    /// This function doesn't simply to calculate time & space based retention: it treats time-based
+    /// retention as authoritative if enabled, and falls back to space-based retention if calculating
+    /// the LSN for a time point isn't possible.  Therefore the GcCutoffs::horizon in the response might
+    /// be different to the `space_cutoff` input.  Callers should treat the min() of the two cutoffs
+    /// in the response as the GC cutoff point for the timeline.
    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
    pub(super) async fn find_gc_cutoffs(
        &self,
-        cutoff_horizon: Lsn,
+        space_cutoff: Lsn,
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
@@ -4974,58 +4986,87 @@ impl Timeline {

        pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");

-        // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
-        //
-        // Some unit tests depend on garbage-collection working even when
-        // CLOG data is missing, so that find_lsn_for_timestamp() doesn't
-        // work, so avoid calling it altogether if time-based retention is not
-        // configured. It would be pointless anyway.
-        let pitr_cutoff = if pitr != Duration::ZERO {
-            let now = SystemTime::now();
-            if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
-                let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
-
-                match self
-                    .find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
-                    .await?
-                {
-                    LsnForTimestamp::Present(lsn) => lsn,
-                    LsnForTimestamp::Future(lsn) => {
-                        // The timestamp is in the future. That sounds impossible,
-                        // but what it really means is that there hasn't been
-                        // any commits since the cutoff timestamp.
-                        //
-                        // In this case we should use the LSN of the most recent commit,
-                        // which is implicitly the last LSN in the log.
-                        debug!("future({})", lsn);
-                        self.get_last_record_lsn()
-                    }
-                    LsnForTimestamp::Past(lsn) => {
-                        debug!("past({})", lsn);
-                        // conservative, safe default is to remove nothing, when we
-                        // have no commit timestamp data available
-                        *self.get_latest_gc_cutoff_lsn()
-                    }
-                    LsnForTimestamp::NoData(lsn) => {
-                        debug!("nodata({})", lsn);
-                        // conservative, safe default is to remove nothing, when we
-                        // have no commit timestamp data available
-                        *self.get_latest_gc_cutoff_lsn()
-                    }
-                }
-            } else {
-                // If we don't have enough data to convert to LSN,
-                // play safe and don't remove any layers.
-                *self.get_latest_gc_cutoff_lsn()
+        if cfg!(test) {
+            // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
+            if pitr == Duration::ZERO {
+                return Ok(GcCutoffs {
+                    time: self.get_last_record_lsn(),
+                    space: space_cutoff,
+                });
+            }
+        }
+
+        // Calculate a time-based limit on how much to retain:
+        // - if PITR interval is set, then this is our cutoff.
+        // - if PITR interval is not set, then we do a lookup
+        //   based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
+        let time_cutoff = {
+            let now = SystemTime::now();
+            let time_range = if pitr == Duration::ZERO {
+                humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
+            } else {
+                pitr
+            };
+
+            // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
+            let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
+            let timestamp = to_pg_timestamp(time_cutoff);
+
+            match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
+                LsnForTimestamp::Present(lsn) => Some(lsn),
+                LsnForTimestamp::Future(lsn) => {
+                    // The timestamp is in the future. That sounds impossible,
+                    // but what it really means is that there hasn't been
+                    // any commits since the cutoff timestamp.
+                    //
+                    // In this case we should use the LSN of the most recent commit,
+                    // which is implicitly the last LSN in the log.
+                    debug!("future({})", lsn);
+                    Some(self.get_last_record_lsn())
+                }
+                LsnForTimestamp::Past(lsn) => {
+                    debug!("past({})", lsn);
+                    None
+                }
+                LsnForTimestamp::NoData(lsn) => {
+                    debug!("nodata({})", lsn);
+                    None
+                }
            }
-        } else {
-            // No time-based retention was configured. Interpret this as "keep no history".
-            self.get_last_record_lsn()
        };

-        Ok(GcCutoffs {
-            horizon: cutoff_horizon,
-            pitr: pitr_cutoff,
+        Ok(match (pitr, time_cutoff) {
+            (Duration::ZERO, Some(time_cutoff)) => {
+                // PITR is not set. Retain the size-based limit, or the default time retention,
+                // whichever requires less data.
+                GcCutoffs {
+                    time: self.get_last_record_lsn(),
+                    space: std::cmp::max(time_cutoff, space_cutoff),
+                }
+            }
+            (Duration::ZERO, None) => {
+                // PITR is not set, and time lookup failed
+                GcCutoffs {
+                    time: self.get_last_record_lsn(),
+                    space: space_cutoff,
+                }
+            }
+            (_, None) => {
+                // PITR interval is set & we didn't look up a timestamp successfully.  Conservatively assume PITR
+                // cannot advance beyond what was already GC'd, and respect space-based retention
+                GcCutoffs {
+                    time: *self.get_latest_gc_cutoff_lsn(),
+                    space: space_cutoff,
+                }
+            }
+            (_, Some(time_cutoff)) => {
+                // PITR interval is set and we looked up timestamp successfully.  Ignore
+                // size based retention and make time cutoff authoritative
+                GcCutoffs {
+                    time: time_cutoff,
+                    space: time_cutoff,
+                }
+            }
        })
    }

@@ -5050,12 +5091,16 @@ impl Timeline {
            return Err(GcError::TimelineCancelled);
        }

-        let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
+        let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
            let gc_info = self.gc_info.read().unwrap();

-            let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
-            let pitr_cutoff = gc_info.cutoffs.pitr;
-            let retain_lsns = gc_info.retain_lsns.clone();
+            let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
+            let time_cutoff = gc_info.cutoffs.time;
+            let retain_lsns = gc_info
+                .retain_lsns
+                .iter()
+                .map(|(lsn, _child_id)| *lsn)
+                .collect();

            // Gets the maximum LSN that holds the valid lease.
            //
@@ -5064,14 +5109,14 @@ impl Timeline {
            let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);

            (
-                horizon_cutoff,
-                pitr_cutoff,
+                space_cutoff,
+                time_cutoff,
                retain_lsns,
                max_lsn_with_valid_lease,
            )
        };

-        let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
+        let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
        let standby_horizon = self.standby_horizon.load();
        // Hold GC for the standby, but as a safety guard do it only within some
        // reasonable lag.
@@ -5100,8 +5145,8 @@ impl Timeline {

        let res = self
            .gc_timeline(
-                horizon_cutoff,
-                pitr_cutoff,
+                space_cutoff,
+                time_cutoff,
                retain_lsns,
                max_lsn_with_valid_lease,
                new_gc_cutoff,
@@ -5119,8 +5164,8 @@ impl Timeline {

    async fn gc_timeline(
        &self,
-        horizon_cutoff: Lsn,
-        pitr_cutoff: Lsn,
+        space_cutoff: Lsn,
+        time_cutoff: Lsn,
        retain_lsns: Vec<Lsn>,
        max_lsn_with_valid_lease: Option<Lsn>,
        new_gc_cutoff: Lsn,
@@ -5181,22 +5226,22 @@ impl Timeline {
            result.layers_total += 1;

            // 1. Is it newer than GC horizon cutoff point?
-            if l.get_lsn_range().end > horizon_cutoff {
+            if l.get_lsn_range().end > space_cutoff {
                debug!(
-                    "keeping {} because it's newer than horizon_cutoff {}",
+                    "keeping {} because it's newer than space_cutoff {}",
                    l.layer_name(),
-                    horizon_cutoff,
+                    space_cutoff,
                );
                result.layers_needed_by_cutoff += 1;
                continue 'outer;
            }

            // 2. It is newer than PiTR cutoff point?
-            if l.get_lsn_range().end > pitr_cutoff {
+            if l.get_lsn_range().end > time_cutoff {
                debug!(
-                    "keeping {} because it's newer than pitr_cutoff {}",
+                    "keeping {} because it's newer than time_cutoff {}",
                    l.layer_name(),
-                    pitr_cutoff,
+                    time_cutoff,
                );
                result.layers_needed_by_pitr += 1;
                continue 'outer;
@@ -5417,7 +5462,6 @@ impl Timeline {
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "download all remote layers task",
-            false,
            async move {
                self_clone.download_all_remote_layers(request).await;
                let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
@@ -5568,7 +5612,7 @@ impl Timeline {
                let file_size = layer.layer_desc().file_size;
                max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));

-                let last_activity_ts = layer.access_stats().latest_activity_or_now();
+                let last_activity_ts = layer.access_stats().latest_activity();

                EvictionCandidate {
                    layer: layer.into(),
@@ -6028,8 +6072,9 @@ mod tests {

    #[tokio::test]
    async fn two_layer_eviction_attempts_at_the_same_time() {
-        let harness =
-            TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
+        let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
+            .await
+            .unwrap();

        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -26,15 +26,17 @@ use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
-use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
-use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
+use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
+use crate::tenant::storage_layer::merge_iterator::MergeIterator;
+use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState};
+use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
+use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};

 use crate::keyspace::KeySpace;
-use crate::repository::Key;
+use crate::repository::{Key, Value};

 use utils::lsn::Lsn;

@@ -43,6 +45,60 @@ use pageserver_compaction::interface::*;

 use super::CompactionError;

+/// Maximum number of deltas before generating an image layer in bottom-most compaction.
+const COMPACTION_DELTA_THRESHOLD: usize = 5;
+
+/// The result of bottom-most compaction for a single key at each LSN.
+#[derive(Debug)]
+#[cfg_attr(test, derive(PartialEq))]
+pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>);
+
+/// The result of bottom-most compaction.
+#[derive(Debug)]
+#[cfg_attr(test, derive(PartialEq))]
+pub(crate) struct KeyHistoryRetention {
+    /// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN.
+    pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>,
+    /// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN.
+    pub(crate) above_horizon: KeyLogAtLsn,
+}
+
+impl KeyHistoryRetention {
+    async fn pipe_to(
+        self,
+        key: Key,
+        delta_writer: &mut Vec<(Key, Lsn, Value)>,
+        image_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let mut first_batch = true;
+        for (_, KeyLogAtLsn(logs)) in self.below_horizon {
+            if first_batch {
+                if logs.len() == 1 && logs[0].1.is_image() {
+                    let Value::Image(img) = &logs[0].1 else {
+                        unreachable!()
+                    };
+                    image_writer.put_image(key, img.clone(), ctx).await?;
+                } else {
+                    for (lsn, val) in logs {
+                        delta_writer.push((key, lsn, val));
+                    }
+                }
+                first_batch = false;
+            } else {
+                for (lsn, val) in logs {
+                    delta_writer.push((key, lsn, val));
+                }
+            }
+        }
+        let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
+        for (lsn, val) in above_horizon_logs {
+            delta_writer.push((key, lsn, val));
+        }
+        Ok(())
+    }
+}
+
 impl Timeline {
    /// TODO: cancellation
    pub(crate) async fn compact_legacy(
@@ -195,7 +251,7 @@ impl Timeline {
        tracing::info!(
            "latest_gc_cutoff: {}, pitr cutoff {}",
            *latest_gc_cutoff,
-            self.gc_info.read().unwrap().cutoffs.pitr
+            self.gc_info.read().unwrap().cutoffs.time
        );

        let layers = self.layers.read().await;
@@ -379,7 +435,7 @@ impl Timeline {
            };

            let begin = tokio::time::Instant::now();
-            let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
+            let phase1_layers_locked = self.layers.read().await;
            let now = tokio::time::Instant::now();
            stats.read_lock_acquisition_micros =
                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
@@ -399,9 +455,9 @@ impl Timeline {
    }

    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
-    async fn compact_level0_phase1(
-        self: &Arc<Self>,
-        guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
+    async fn compact_level0_phase1<'a>(
+        self: &'a Arc<Self>,
+        guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
        mut stats: CompactLevel0Phase1StatsBuilder,
        target_file_size: u64,
        ctx: &RequestContext,
@@ -415,6 +471,7 @@ impl Timeline {
            .map(|x| guard.get_from_desc(&x))
            .collect_vec();
        stats.level0_deltas_count = Some(level0_deltas.len());
+
        // Only compact if enough layers have accumulated.
        let threshold = self.get_compaction_threshold();
        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
@@ -445,6 +502,22 @@ impl Timeline {
        let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
        let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());

+        // Accumulate the size of layers in `deltas_to_compact`
+        let mut deltas_to_compact_bytes = 0;
+
+        // Under normal circumstances, we will accumulate up to compaction_interval L0s of size
+        // checkpoint_distance each.  To avoid edge cases using extra system resources, bound our
+        // work in this function to only operate on this much delta data at once.
+        //
+        // Take the max of the configured value & the default, so that tests that configure tiny values
+        // can still use a sensible amount of memory, but if a deployed system configures bigger values we
+        // still let them compact a full stack of L0s in one go.
+        let delta_size_limit = std::cmp::max(
+            self.get_compaction_threshold(),
+            DEFAULT_COMPACTION_THRESHOLD,
+        ) as u64
+            * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
+
        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
        for l in level0_deltas_iter {
            let lsn_range = &l.layer_desc().lsn_range;
@@ -453,7 +526,20 @@ impl Timeline {
                break;
            }
            deltas_to_compact.push(l.download_and_keep_resident().await?);
+            deltas_to_compact_bytes += l.metadata().file_size;
            prev_lsn_end = lsn_range.end;
+
+            if deltas_to_compact_bytes >= delta_size_limit {
+                info!(
+                    l0_deltas_selected = deltas_to_compact.len(),
+                    l0_deltas_total = level0_deltas.len(),
+                    "L0 compaction picker hit max delta layer size limit: {}",
+                    delta_size_limit
+                );
+
+                // Proceed with compaction, but only a subset of L0s
+                break;
+            }
        }
        let lsn_range = Range {
            start: deltas_to_compact
@@ -957,6 +1043,188 @@ impl Timeline {
        Ok(())
    }

+    /// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns.
+    ///
+    /// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon.
+    /// For now, it requires the `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is
+    /// an image or a WAL not requiring a base image). This restriction will be removed once we implement gc-compaction on branch.
+    ///
+    /// The function returns the deltas and the base image that need to be placed at each of the retain LSN. For example, we have:
+    ///
+    /// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60
+    /// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3
+    ///
+    /// The function will produce:
+    ///
+    /// ```plain
+    /// 0x20(retain_lsn) -> img=AB@0x20                  always produce a single image below the lowest retain LSN
+    /// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40]    two deltas since the last base image, keeping the deltas
+    /// 0x50(horizon)    -> deltas=[ABCDE@0x50]          three deltas since the last base image, generate an image but put it in the delta
+    /// above_horizon    -> deltas=[+F@0x60]             full history above the horizon
+    /// ```
+    ///
+    /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
+    pub(crate) async fn generate_key_retention(
+        self: &Arc<Timeline>,
+        key: Key,
+        history: &[(Key, Lsn, Value)],
+        horizon: Lsn,
+        retain_lsn_below_horizon: &[Lsn],
+        delta_threshold_cnt: usize,
+    ) -> anyhow::Result<KeyHistoryRetention> {
+        // Pre-checks for the invariants
+        if cfg!(debug_assertions) {
+            for (log_key, _, _) in history {
+                assert_eq!(log_key, &key, "mismatched key");
+            }
+            for i in 1..history.len() {
+                assert!(history[i - 1].1 <= history[i].1, "unordered LSN");
+                if history[i - 1].1 == history[i].1 {
+                    assert!(
+                        matches!(history[i - 1].2, Value::Image(_)),
+                        "unordered delta/image, or duplicated delta"
+                    );
+                }
+            }
+            if let Value::WalRecord(rec) = &history[0].2 {
+                assert!(rec.will_init(), "no base image");
+            }
+            for lsn in retain_lsn_below_horizon {
+                assert!(lsn < &horizon, "retain lsn must be below horizon")
+            }
+            for i in 1..retain_lsn_below_horizon.len() {
+                assert!(
+                    retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i],
+                    "unordered LSN"
+                );
+            }
+        }
+        // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
+        // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
+        let (mut split_history, lsn_split_points) = {
+            let mut split_history = Vec::new();
+            split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new);
+            let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1);
+            for lsn in retain_lsn_below_horizon {
+                lsn_split_points.push(*lsn);
+            }
+            lsn_split_points.push(horizon);
+            let mut current_idx = 0;
+            for item @ (_, lsn, _) in history {
+                while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] {
+                    current_idx += 1;
+                }
+                split_history[current_idx].push(item);
+            }
+            (split_history, lsn_split_points)
+        };
+        // Step 2: filter out duplicated records due to the k-merge of image/delta layers
+        for split_for_lsn in &mut split_history {
+            let mut prev_lsn = None;
+            let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len());
+            for record @ (_, lsn, _) in std::mem::take(split_for_lsn) {
+                if let Some(prev_lsn) = &prev_lsn {
+                    if *prev_lsn == lsn {
+                        // The case that we have an LSN with both data from the delta layer and the image layer. As
+                        // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
+                        // drop this delta and keep the image.
+                        //
+                        // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
+                        // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
+                        // dropped.
+                        continue;
+                    }
+                }
+                prev_lsn = Some(lsn);
+                new_split_for_lsn.push(record);
+            }
+            *split_for_lsn = new_split_for_lsn;
+        }
+        // Step 3: generate images when necessary
+        let mut retention = Vec::with_capacity(split_history.len());
+        let mut records_since_last_image = 0;
+        let batch_cnt = split_history.len();
+        assert!(
+            batch_cnt >= 2,
+            "should have at least below + above horizon batches"
+        );
+        let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
+        for (i, split_for_lsn) in split_history.into_iter().enumerate() {
+            records_since_last_image += split_for_lsn.len();
+            let generate_image = if i == 0 {
+                // We always generate images for the first batch (below horizon / lowest retain_lsn)
+                true
+            } else if i == batch_cnt - 1 {
+                // Do not generate images for the last batch (above horizon)
+                false
+            } else if records_since_last_image >= delta_threshold_cnt {
+                // Generate images when there are too many records
+                true
+            } else {
+                false
+            };
+            replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone()));
+            if let Some((_, _, val)) = replay_history.first() {
+                assert!(val.will_init(), "invalid history, no base image");
+            }
+            // Only retain the items after the last image record
+            for idx in (0..replay_history.len()).rev() {
+                if replay_history[idx].2.will_init() {
+                    replay_history = replay_history[idx..].to_vec();
+                    break;
+                }
+            }
+            if generate_image && records_since_last_image > 0 {
+                records_since_last_image = 0;
+                let history = std::mem::take(&mut replay_history);
+                let mut img = None;
+                let mut records = Vec::with_capacity(history.len());
+                if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
+                    img = Some((*lsn, val.clone()));
+                    for (_, lsn, val) in history.into_iter().skip(1) {
+                        let Value::WalRecord(rec) = val else {
+                            panic!("invalid record")
+                        };
+                        records.push((lsn, rec));
+                    }
+                } else {
+                    for (_, lsn, val) in history.into_iter() {
+                        let Value::WalRecord(rec) = val else {
+                            panic!("invalid record")
+                        };
+                        records.push((lsn, rec));
+                    }
+                }
+                records.reverse();
+                let state = ValueReconstructState { img, records };
+                let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range
+                let img = self.reconstruct_value(key, request_lsn, state).await?;
+                replay_history.push((key, request_lsn, Value::Image(img.clone())));
+                retention.push(vec![(request_lsn, Value::Image(img))]);
+            } else {
+                retention.push(
+                    split_for_lsn
+                        .iter()
+                        .map(|(_, lsn, value)| (*lsn, value.clone()))
+                        .collect(),
+                );
+            }
+        }
+        let mut result = Vec::with_capacity(retention.len());
+        assert_eq!(retention.len(), lsn_split_points.len() + 1);
+        for (idx, logs) in retention.into_iter().enumerate() {
+            if idx == lsn_split_points.len() {
+                return Ok(KeyHistoryRetention {
+                    below_horizon: result,
+                    above_horizon: KeyLogAtLsn(logs),
+                });
+            } else {
+                result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
+            }
+        }
+        unreachable!()
+    }
+
    /// An experimental compaction building block that combines compaction with garbage collection.
    ///
    /// The current implementation picks all delta + image layers that are below or intersecting with
@@ -968,7 +1236,6 @@ impl Timeline {
        _cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<(), CompactionError> {
-        use crate::tenant::storage_layer::ValueReconstructState;
        use std::collections::BTreeSet;

        info!("running enhanced gc bottom-most compaction");
@@ -981,37 +1248,60 @@ impl Timeline {
        // The layer selection has the following properties:
        // 1. If a layer is in the selection, all layers below it are in the selection.
        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
-        let (layer_selection, gc_cutoff) = {
+        let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
            let gc_info = self.gc_info.read().unwrap();
-            if !gc_info.retain_lsns.is_empty() || !gc_info.leases.is_empty() {
-                return Err(CompactionError::Other(anyhow!(
-                    "enhanced legacy compaction currently does not support retain_lsns (branches)"
-                )));
+            let mut retain_lsns_below_horizon = Vec::new();
+            let gc_cutoff = gc_info.cutoffs.select_min();
+            for (lsn, _timeline_id) in &gc_info.retain_lsns {
+                if lsn < &gc_cutoff {
+                    retain_lsns_below_horizon.push(*lsn);
+                }
+            }
+            for lsn in gc_info.leases.keys() {
+                if lsn < &gc_cutoff {
+                    retain_lsns_below_horizon.push(*lsn);
+                }
            }
-            let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
            let mut selected_layers = Vec::new();
-            // TODO: consider retain_lsns
            drop(gc_info);
            for desc in layers.iter_historic_layers() {
                if desc.get_lsn_range().start <= gc_cutoff {
                    selected_layers.push(guard.get_from_desc(&desc));
                }
            }
-            (selected_layers, gc_cutoff)
+            retain_lsns_below_horizon.sort();
+            (selected_layers, gc_cutoff, retain_lsns_below_horizon)
        };
+        let lowest_retain_lsn = retain_lsns_below_horizon
+            .first()
+            .copied()
+            .unwrap_or(gc_cutoff);
+        if cfg!(debug_assertions) {
+            assert_eq!(
+                lowest_retain_lsn,
+                retain_lsns_below_horizon
+                    .iter()
+                    .min()
+                    .copied()
+                    .unwrap_or(gc_cutoff)
+            );
+        }
        info!(
-            "picked {} layers for compaction with gc_cutoff={}",
+            "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
            layer_selection.len(),
-            gc_cutoff
+            gc_cutoff,
+            lowest_retain_lsn
        );
        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
        // Also, collect the layer information to decide when to split the new delta layers.
-        let mut all_key_values = Vec::new();
+        let mut downloaded_layers = Vec::new();
        let mut delta_split_points = BTreeSet::new();
        for layer in &layer_selection {
-            all_key_values.extend(layer.load_key_values(ctx).await?);
+            let resident_layer = layer.download_and_keep_resident().await?;
+            downloaded_layers.push(resident_layer);
+
            let desc = layer.layer_desc();
            if desc.is_delta() {
                // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
@@ -1021,86 +1311,22 @@ impl Timeline {
                delta_split_points.insert(key_range.end);
            }
        }
-        // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
-        // image layers, make image appear before than delta.
-        struct ValueWrapper<'a>(&'a crate::repository::Value);
-        impl Ord for ValueWrapper<'_> {
-            fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-                use crate::repository::Value;
-                use std::cmp::Ordering;
-                match (self.0, other.0) {
-                    (Value::Image(_), Value::WalRecord(_)) => Ordering::Less,
-                    (Value::WalRecord(_), Value::Image(_)) => Ordering::Greater,
-                    _ => Ordering::Equal,
-                }
+        let mut delta_layers = Vec::new();
+        let mut image_layers = Vec::new();
+        for resident_layer in &downloaded_layers {
+            if resident_layer.layer_desc().is_delta() {
+                let layer = resident_layer.get_as_delta(ctx).await?;
+                delta_layers.push(layer);
+            } else {
+                let layer = resident_layer.get_as_image(ctx).await?;
+                image_layers.push(layer);
            }
        }
-        impl PartialOrd for ValueWrapper<'_> {
-            fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-                Some(self.cmp(other))
-            }
-        }
-        impl PartialEq for ValueWrapper<'_> {
-            fn eq(&self, other: &Self) -> bool {
-                self.cmp(other) == std::cmp::Ordering::Equal
-            }
-        }
-        impl Eq for ValueWrapper<'_> {}
-        all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
-            (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
-        });
+        let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
        // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
        // Data of the same key.
        let mut accumulated_values = Vec::new();
-        let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
-
-        /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
-        async fn flush_accumulated_states(
-            tline: &Arc<Timeline>,
-            key: Key,
-            accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
-            horizon: Lsn,
-        ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
-            let mut base_image = None;
-            let mut keys_above_horizon = Vec::new();
-            let mut delta_above_base_image = Vec::new();
-            // We have a list of deltas/images. We want to create image layers while collect garbages.
-            for (key, lsn, val) in accumulated_values.iter().rev() {
-                if *lsn > horizon {
-                    if let Some((_, prev_lsn, _)) = keys_above_horizon.last_mut() {
-                        if *prev_lsn == *lsn {
-                            // The case that we have an LSN with both data from the delta layer and the image layer. As
-                            // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
-                            // drop this delta and keep the image.
-                            //
-                            // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
-                            // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
-                            // dropped.
-                            continue;
-                        }
-                    }
-                    keys_above_horizon.push((*key, *lsn, val.clone()));
-                } else if *lsn <= horizon {
-                    match val {
-                        crate::repository::Value::Image(image) => {
-                            base_image = Some((*lsn, image.clone()));
-                            break;
-                        }
-                        crate::repository::Value::WalRecord(wal) => {
-                            delta_above_base_image.push((*lsn, wal.clone()));
-                        }
-                    }
-                }
-            }
-            // do not reverse delta_above_base_image, reconstruct state expects reversely-ordered records
-            keys_above_horizon.reverse();
-            let state = ValueReconstructState {
-                img: base_image,
-                records: delta_above_base_image,
-            };
-            let img = tline.reconstruct_value(key, horizon, state).await?;
-            Ok((keys_above_horizon, img))
-        }
+        let mut last_key: Option<Key> = None;

        async fn flush_deltas(
            deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
@@ -1108,7 +1334,7 @@ impl Timeline {
            delta_split_points: &[Key],
            current_delta_split_point: &mut usize,
            tline: &Arc<Timeline>,
-            gc_cutoff: Lsn,
+            lowest_retain_lsn: Lsn,
            ctx: &RequestContext,
        ) -> anyhow::Result<Option<ResidentLayer>> {
            // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
@@ -1143,7 +1369,7 @@ impl Timeline {
                tline.timeline_id,
                tline.tenant_shard_id,
                deltas.first().unwrap().0,
-                gc_cutoff..end_lsn,
+                lowest_retain_lsn..end_lsn,
                ctx,
            )
            .await?;
@@ -1159,8 +1385,8 @@ impl Timeline {
            self.conf,
            self.timeline_id,
            self.tenant_shard_id,
-            &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
-            gc_cutoff,
+            &(Key::MIN..Key::MAX), // covers the full key range
+            lowest_retain_lsn,
            ctx,
        )
        .await?;
@@ -1169,40 +1395,60 @@ impl Timeline {
        let delta_split_points = delta_split_points.into_iter().collect_vec();
        let mut current_delta_split_point = 0;
        let mut delta_layers = Vec::new();
-        for item @ (key, _, _) in &all_key_values {
-            if &last_key == key {
-                accumulated_values.push(item);
+        while let Some((key, lsn, val)) = merge_iter.next().await? {
+            if last_key.is_none() || last_key.as_ref() == Some(&key) {
+                if last_key.is_none() {
+                    last_key = Some(key);
+                }
+                accumulated_values.push((key, lsn, val));
            } else {
-                let (deltas, image) =
-                    flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
-                        .await?;
+                let last_key = last_key.as_mut().unwrap();
+                let retention = self
+                    .generate_key_retention(
+                        *last_key,
+                        &accumulated_values,
+                        gc_cutoff,
+                        &retain_lsns_below_horizon,
+                        COMPACTION_DELTA_THRESHOLD,
+                    )
+                    .await?;
                // Put the image into the image layer. Currently we have a single big layer for the compaction.
-                image_layer_writer.put_image(last_key, image, ctx).await?;
-                delta_values.extend(deltas);
+                retention
+                    .pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx)
+                    .await?;
                delta_layers.extend(
                    flush_deltas(
                        &mut delta_values,
-                        last_key,
+                        *last_key,
                        &delta_split_points,
                        &mut current_delta_split_point,
                        self,
-                        gc_cutoff,
+                        lowest_retain_lsn,
                        ctx,
                    )
                    .await?,
                );
                accumulated_values.clear();
-                accumulated_values.push(item);
-                last_key = *key;
+                *last_key = key;
+                accumulated_values.push((key, lsn, val));
            }
        }

+        let last_key = last_key.expect("no keys produced during compaction");
        // TODO: move this part to the loop body
-        let (deltas, image) =
-            flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
+        let retention = self
+            .generate_key_retention(
+                last_key,
+                &accumulated_values,
+                gc_cutoff,
+                &retain_lsns_below_horizon,
+                COMPACTION_DELTA_THRESHOLD,
+            )
+            .await?;
        // Put the image into the image layer. Currently we have a single big layer for the compaction.
-        image_layer_writer.put_image(last_key, image, ctx).await?;
-        delta_values.extend(deltas);
+        retention
+            .pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx)
+            .await?;
        delta_layers.extend(
            flush_deltas(
                &mut delta_values,
@@ -1210,7 +1456,7 @@ impl Timeline {
                &delta_split_points,
                &mut current_delta_split_point,
                self,
-                gc_cutoff,
+                lowest_retain_lsn,
                ctx,
            )
            .await?,
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -148,14 +148,14 @@ async fn cleanup_remaining_timeline_fs_traces(
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
 async fn remove_timeline_from_tenant(
    tenant: &Tenant,
-    timeline_id: TimelineId,
+    timeline: &Timeline,
    _: &DeletionGuard, // using it as a witness
 ) -> anyhow::Result<()> {
    // Remove the timeline from the map.
    let mut timelines = tenant.timelines.lock().unwrap();
    let children_exist = timelines
        .iter()
-        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
    // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
    // We already deleted the layer files, so it's probably best to panic.
    // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
@@ -164,7 +164,7 @@ async fn remove_timeline_from_tenant(
    }

    timelines
-        .remove(&timeline_id)
+        .remove(&timeline.timeline_id)
        .expect("timeline that we were deleting was concurrently removed from 'timelines' map");

    drop(timelines);
@@ -182,13 +182,15 @@ async fn remove_timeline_from_tenant(
 /// 5. Delete index part
 /// 6. Delete meta, timeline directory
 /// 7. Delete mark file
+///
 /// It is resumable from any step in case a crash/restart occurs.
 /// There are three entrypoints to the process:
 /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
 /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
-/// and we possibly neeed to continue deletion of remote files.
+///    and we possibly neeed to continue deletion of remote files.
 /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
-/// index but still have local metadata, timeline directory and delete mark.
+///    index but still have local metadata, timeline directory and delete mark.
+///
 /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
 pub enum DeleteTimelineFlow {
@@ -389,7 +391,6 @@ impl DeleteTimelineFlow {
            Some(tenant_shard_id),
            Some(timeline_id),
            "timeline_delete",
-            false,
            async move {
                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
                    error!("Error: {err:#}");
@@ -413,7 +414,7 @@ impl DeleteTimelineFlow {

        pausable_failpoint!("in_progress_delete");

-        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
+        remove_timeline_from_tenant(tenant, timeline, &guard).await?;

        *guard = Self::Finished;

--- a/Show More
+++ b/Show More