Merge pull request #8451 from neondatabase/rc/2024-07-22

## Storage & Compute release 2024-07-22 This PR has so many commits because the release branch diverged from `main`. Details: https://neondb.slack.com/archives/C033A2WE6BZ/p1721650938949059?thread_ts=1721308848.034069&cid=C033A2WE6BZ The commits that are truly new since the last storage release are the `main` commits, which I cherry-picked using this command: ``` git cherry-pick 8a8b83df27383a07bb7dbba519325c15d2f46357..4e547e6 ```
Merge commit '4e547e6' into rc/2024-07-22
2026-01-23 21:30:36 +00:00 · 2024-07-22 19:17:01 +02:00 · 2024-07-22 14:40:55 +02:00 · 2024-07-22 14:36:56 +02:00 · 2024-07-22 14:36:56 +02:00 · 2024-07-22 14:36:56 +02:00
118 changed files with 2532 additions and 3780 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +0,0 @@
-# allows for nicer hunk headers with git show
-*.rs diff=rust
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -1,291 +0,0 @@
-name: Build and Test Locally
-
-on:
-  workflow_call:
-    inputs:
-      arch:
-        description: 'x64 or arm64'
-        required: true
-        type: string
-      build-tag:
-        description: 'build tag'
-        required: true
-        type: string
-      build-tools-image:
-        description: 'build-tools image'
-        required: true
-        type: string
-      build-type:
-        description: 'debug or release'
-        required: true
-        type: string
-
-defaults:
-  run:
-    shell: bash -euxo pipefail {0}
-
-env:
-  RUST_BACKTRACE: 1
-  COPT: '-Werror'
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-jobs:
-  build-neon:
-    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
-    container:
-      image: ${{ inputs.build-tools-image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      # Raise locked memory limit for tokio-epoll-uring.
-      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
-      # io_uring will account the memory of the CQ and SQ as locked.
-      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
-    env:
-      BUILD_TYPE: ${{ inputs.build-type }}
-      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
-      BUILD_TAG: ${{ inputs.build-tag }}
-
-    steps:
-      - name: Fix git ownership
-        run: |
-          # Workaround for `fatal: detected dubious ownership in repository at ...`
-          #
-          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
-          #   Ref https://github.com/actions/checkout/issues/785
-          #
-          git config --global --add safe.directory ${{ github.workspace }}
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done
-
-      - uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 1
-
-      - name: Set pg 14 revision for caching
-        id: pg_v14_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
-
-      - name: Set pg 15 revision for caching
-        id: pg_v15_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
-
-      - name: Set pg 16 revision for caching
-        id: pg_v16_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
-
-      # Set some environment variables used by all the steps.
-      #
-      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
-      #   It also includes --features, if any
-      #
-      # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
-      #   because "cargo metadata" doesn't accept --release or --debug options
-      #
-      # We run tests with addtional features, that are turned off by default (e.g. in release builds), see
-      # corresponding Cargo.toml files for their descriptions.
-      - name: Set env variables
-        run: |
-          CARGO_FEATURES="--features testing"
-          if [[ $BUILD_TYPE == "debug" ]]; then
-            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_FLAGS="--locked"
-          elif [[ $BUILD_TYPE == "release" ]]; then
-            cov_prefix=""
-            CARGO_FLAGS="--locked --release"
-          fi
-          {
-            echo "cov_prefix=${cov_prefix}"
-            echo "CARGO_FEATURES=${CARGO_FEATURES}"
-            echo "CARGO_FLAGS=${CARGO_FLAGS}"
-            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
-          } >> $GITHUB_ENV
-
-      - name: Cache postgres v14 build
-        id: cache_pg_14
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
-
-      - name: Cache postgres v15 build
-        id: cache_pg_15
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
-
-      - name: Cache postgres v16 build
-        id: cache_pg_16
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
-
-      - name: Build postgres v14
-        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v14 -j$(nproc)
-
-      - name: Build postgres v15
-        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v15 -j$(nproc)
-
-      - name: Build postgres v16
-        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v16 -j$(nproc)
-
-      - name: Build neon extensions
-        run: mold -run make neon-pg-ext -j$(nproc)
-
-      - name: Build walproposer-lib
-        run: mold -run make walproposer-lib -j$(nproc)
-
-      - name: Run cargo build
-        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
-
-      # Do install *before* running rust tests because they might recompile the
-      # binaries with different features/flags.
-      - name: Install rust binaries
-        run: |
-          # Install target binaries
-          mkdir -p /tmp/neon/bin/
-          binaries=$(
-            ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
-            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
-          )
-          for bin in $binaries; do
-            SRC=target/$BUILD_TYPE/$bin
-            DST=/tmp/neon/bin/$bin
-            cp "$SRC" "$DST"
-          done
-
-          # Install test executables and write list of all binaries (for code coverage)
-          if [[ $BUILD_TYPE == "debug" ]]; then
-            # Keep bloated coverage data files away from the rest of the artifact
-            mkdir -p /tmp/coverage/
-
-            mkdir -p /tmp/neon/test_bin/
-
-            test_exe_paths=$(
-              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
-              jq -r '.executable | select(. != null)'
-            )
-            for bin in $test_exe_paths; do
-              SRC=$bin
-              DST=/tmp/neon/test_bin/$(basename $bin)
-
-              # We don't need debug symbols for code coverage, so strip them out to make
-              # the artifact smaller.
-              strip "$SRC" -o "$DST"
-              echo "$DST" >> /tmp/coverage/binaries.list
-            done
-
-            for bin in $binaries; do
-              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
-            done
-          fi
-
-      - name: Run rust tests
-        env:
-          NEXTEST_RETRIES: 3
-        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
-          export LD_LIBRARY_PATH
-
-          #nextest does not yet support running doctests
-          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
-
-          for io_engine in std-fs tokio-epoll-uring ; do
-            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
-          done
-
-          # Run separate tests for real S3
-          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
-          export REMOTE_STORAGE_S3_REGION=eu-central-1
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
-
-          # Run separate tests for real Azure Blob Storage
-          # XXX: replace region with `eu-central-1`-like region
-          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
-          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
-          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
-
-      - name: Install postgres binaries
-        run: cp -a pg_install /tmp/neon/pg_install
-
-      - name: Upload Neon artifact
-        uses: ./.github/actions/upload
-        with:
-          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
-          path: /tmp/neon
-
-      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
-      - name: Merge and upload coverage data
-        if: inputs.build-type == 'debug'
-        uses: ./.github/actions/save-coverage-data
-
-  regress-tests:
-    # Run test on x64 only
-    if: inputs.arch == 'x64'
-    needs: [ build-neon ]
-    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
-    container:
-      image: ${{ inputs.build-tools-image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      # for changed limits, see comments on `options:` earlier in this file
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
-    strategy:
-      fail-fast: false
-      matrix:
-        pg_version: [ v14, v15, v16 ]
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 1
-
-      - name: Pytest regression tests
-        uses: ./.github/actions/run-python-test-set
-        timeout-minutes: 60
-        with:
-          build_type: ${{ inputs.build-type }}
-          test_selection: regress
-          needs_postgres_source: true
-          run_with_real_s3: true
-          real_s3_bucket: neon-github-ci-tests
-          real_s3_region: eu-central-1
-          rerun_flaky: true
-          pg_version: ${{ matrix.pg_version }}
-        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
-          BUILD_TAG: ${{ inputs.build-tag }}
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_IMPL: vectored
-          PAGESERVER_GET_IMPL: vectored
-          PAGESERVER_VALIDATE_VEC_GET: true
-
-      # Temporary disable this step until we figure out why it's so flaky
-      # Ref https://github.com/neondatabase/neon/issues/4540
-      - name: Merge and upload coverage data
-        if: |
-          false &&
-          inputs.build-type == 'debug' && matrix.pg_version == 'v14'
-        uses: ./.github/actions/save-coverage-data
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -261,7 +261,8 @@ jobs:
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"},
+                                                     { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -48,30 +48,12 @@ jobs:

  tag:
    needs: [ check-permissions ]
-    runs-on: ubuntu-22.04
+    runs-on: [ self-hosted, gen3, small ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
      build-tag: ${{steps.build-tag.outputs.tag}}
-    permissions:
-      id-token: write
-      contents: read

    steps:
-      # - name: Install az cli
-      #   run: |
-      #     curl -sL https://aka.ms/InstallAzureCLIDeb --output-dir /tmp -OJ
-      #     bash /tmp/InstallAzureCLIDeb
-      #
-      - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_RUNNER_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
-      - name: push
-        run: |
-          az acr login --name neoneastus2
-          docker buildx imagetools create -t neoneastus2.azurecr.io/neondatabase/neon:5718 neondatabase/neon:5718
-
      - name: Checkout
        uses: actions/checkout@v4
        with:
@@ -143,11 +125,7 @@ jobs:

  check-codestyle-rust:
    needs: [ check-permissions, build-build-tools-image ]
-    strategy:
-      matrix:
-        arch: [ x64, arm64 ]
-    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
-
+    runs-on: [ self-hosted, gen3, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
      credentials:
@@ -215,27 +193,291 @@ jobs:
        if: ${{ !cancelled() }}
        run: cargo deny check --hide-inclusion-graph

-  build-and-test-locally:
-    needs: [ tag, build-build-tools-image ]
+  build-neon:
+    needs: [ check-permissions, tag, build-build-tools-image ]
+    runs-on: [ self-hosted, gen3, large ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      # Raise locked memory limit for tokio-epoll-uring.
+      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
+      # io_uring will account the memory of the CQ and SQ as locked.
+      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    strategy:
      fail-fast: false
      matrix:
-        arch: [ x64 ]
-        build-type: [ debug, release ]
-        include:
-          - build-type: release
-            arch: arm64
-    uses: ./.github/workflows/_build-and-test-locally.yml
-    with:
-      arch: ${{ matrix.arch }}
-      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
-      build-tag: ${{ needs.tag.outputs.build-tag }}
-      build-type: ${{ matrix.build-type }}
-    secrets: inherit
+        build_type: [ debug, release ]
+    env:
+      BUILD_TYPE: ${{ matrix.build_type }}
+      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      # Set some environment variables used by all the steps.
+      #
+      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
+      #   It also includes --features, if any
+      #
+      # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
+      #   because "cargo metadata" doesn't accept --release or --debug options
+      #
+      # We run tests with additional features that are turned off by default (e.g. in release builds), see
+      # corresponding Cargo.toml files for their descriptions.
+      - name: Set env variables
+        run: |
+          CARGO_FEATURES="--features testing"
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
+            CARGO_FLAGS="--locked"
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=""
+            CARGO_FLAGS="--locked --release"
+          fi
+          {
+            echo "cov_prefix=${cov_prefix}"
+            echo "CARGO_FEATURES=${CARGO_FEATURES}"
+            echo "CARGO_FLAGS=${CARGO_FLAGS}"
+            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
+          } >> $GITHUB_ENV
+
+      # Disabled for now
+      # Don't include the ~/.cargo/registry/src directory. It contains just
+      # uncompressed versions of the crates in ~/.cargo/registry/cache
+      # directory, and it's faster to let 'cargo' rebuild it from the
+      # compressed crates.
+#      - name: Cache cargo deps
+#        id: cache_cargo
+#        uses: actions/cache@v4
+#        with:
+#          path: |
+#            ~/.cargo/registry/
+#            !~/.cargo/registry/src
+#            ~/.cargo/git/
+#            target/
+#          # Fall back to older versions of the key, if no cache for current Cargo.lock was found
+#          key: |
+#            v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
+#            v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      # Do install *before* running rust tests because they might recompile the
+      # binaries with different features/flags.
+      - name: Install rust binaries
+        run: |
+          # Install target binaries
+          mkdir -p /tmp/neon/bin/
+          binaries=$(
+            ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
+            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
+          )
+          for bin in $binaries; do
+            SRC=target/$BUILD_TYPE/$bin
+            DST=/tmp/neon/bin/$bin
+            cp "$SRC" "$DST"
+          done
+
+          # Install test executables and write list of all binaries (for code coverage)
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            # Keep bloated coverage data files away from the rest of the artifact
+            mkdir -p /tmp/coverage/
+
+            mkdir -p /tmp/neon/test_bin/
+
+            test_exe_paths=$(
+              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
+              jq -r '.executable | select(. != null)'
+            )
+            for bin in $test_exe_paths; do
+              SRC=$bin
+              DST=/tmp/neon/test_bin/$(basename $bin)
+
+              # We don't need debug symbols for code coverage, so strip them out to make
+              # the artifact smaller.
+              strip "$SRC" -o "$DST"
+              echo "$DST" >> /tmp/coverage/binaries.list
+            done
+
+            for bin in $binaries; do
+              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
+            done
+          fi
+
+      - name: Run rust tests
+        env:
+          NEXTEST_RETRIES: 3
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+          export LD_LIBRARY_PATH
+
+          #nextest does not yet support running doctests
+          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
+
+          for io_engine in std-fs tokio-epoll-uring ; do
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          done
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
+
+      - name: Install postgres binaries
+        run: cp -a pg_install /tmp/neon/pg_install
+
+      - name: Upload Neon artifact
+        uses: ./.github/actions/upload
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
+          path: /tmp/neon
+
+      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  regress-tests:
+    needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
+    runs-on: [ self-hosted, gen3, large ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      # for changed limits, see comments on `options:` earlier in this file
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        pg_version: [ v14, v15, v16 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Pytest regression tests
+        uses: ./.github/actions/run-python-test-set
+        timeout-minutes: 60
+        with:
+          build_type: ${{ matrix.build_type }}
+          test_selection: regress
+          needs_postgres_source: true
+          run_with_real_s3: true
+          real_s3_bucket: neon-github-ci-tests
+          real_s3_region: eu-central-1
+          rerun_flaky: true
+          pg_version: ${{ matrix.pg_version }}
+        env:
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
+          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_IMPL: vectored
+          PAGESERVER_GET_IMPL: vectored
+          PAGESERVER_VALIDATE_VEC_GET: true
+
+      # Temporarily disable this step until we figure out why it's so flaky
+      # Ref https://github.com/neondatabase/neon/issues/4540
+      - name: Merge and upload coverage data
+        if: |
+          false &&
+          matrix.build_type == 'debug' && matrix.pg_version == 'v14'
+        uses: ./.github/actions/save-coverage-data

-  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
  get-benchmarks-durations:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    outputs:
      json: ${{ steps.get-benchmark-durations.outputs.json }}
    needs: [ check-permissions, build-build-tools-image ]
@@ -246,6 +488,7 @@ jobs:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
+    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -270,8 +513,7 @@ jobs:
          echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT

  benchmarks:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
-    needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
+    needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -280,6 +522,7 @@ jobs:
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      # for changed limits, see comments on `options:` earlier in this file
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
      matrix:
@@ -327,7 +570,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  create-test-report:
-    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -378,7 +621,7 @@ jobs:
            })

  coverage-report:
-    needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
+    needs: [ check-permissions, regress-tests, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -529,8 +772,7 @@ jobs:
          pull: true
          file: Dockerfile
          cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
-          # 23.07.2024 temporarily disable cache saving in the registry as it is very slow
-          # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
          tags: |
            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

@@ -623,8 +865,7 @@ jobs:
          pull: true
          file: Dockerfile.compute-node
          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
-          # 23.07.2024 temporarily disable cache saving in the registry as it is very slow
-          # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
          tags: |
            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

@@ -644,8 +885,7 @@ jobs:
          file: Dockerfile.compute-node
          target: neon-pg-ext-test
          cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
-          # 23.07.2024 temporarily disable cache saving in the registry as it is very slow
-          # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
          tags: |
            neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}

@@ -852,12 +1092,6 @@ jobs:
      VERSIONS: v14 v15 v16

    steps:
-      - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_RUNNER_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -989,7 +1223,7 @@ jobs:
          exit 1

  deploy:
-    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
+    needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
    if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'

    runs-on: [ self-hosted, gen3, small ]
@@ -1090,7 +1324,7 @@ jobs:
            })

  promote-compatibility-data:
-    needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
+    needs: [ check-permissions, promote-images, tag, regress-tests ]
    if: github.ref_name == 'release'

    runs-on: [ self-hosted, gen3, small ]
@@ -1129,7 +1363,7 @@ jobs:
          done

  pin-build-tools-image:
-    needs: [ build-build-tools-image, promote-images, build-and-test-locally ]
+    needs: [ build-build-tools-image, promote-images, regress-tests ]
    if: github.ref_name == 'main'
    uses: ./.github/workflows/pin-build-tools-image.yml
    with:
@@ -1151,7 +1385,7 @@ jobs:
    needs:
      - check-codestyle-python
      - check-codestyle-rust
-      - build-and-test-locally
+      - regress-tests
      - test-images
    runs-on: ubuntu-22.04
    steps:
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -133,6 +133,221 @@ jobs:
      - name: Check that no warnings are produced
        run: ./run_clippy.sh

+  check-linux-arm-build:
+    needs: [ check-permissions, build-build-tools-image ]
+    timeout-minutes: 90
+    runs-on: [ self-hosted, small-arm64 ]
+
+    env:
+      # Use release build only, to have less debug info around
+      # Hence keeping target/ (and general cache size) smaller
+      BUILD_TYPE: release
+      CARGO_FEATURES: --features testing
+      CARGO_FLAGS: --release
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      - name: Set env variables
+        run: |
+          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
+
+      - name: Run cargo test
+        env:
+          NEXTEST_RETRIES: 3
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+          export LD_LIBRARY_PATH
+
+          cargo nextest run $CARGO_FEATURES -j$(nproc)
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
+
+  check-codestyle-rust-arm:
+    needs: [ check-permissions, build-build-tools-image ]
+    timeout-minutes: 90
+    runs-on: [ self-hosted, small-arm64 ]
+
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      # Some of our rust modules use FFI and need those to be checked
+      - name: Get postgres headers
+        run: make postgres-headers -j$(nproc)
+
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+
+      - name: Run cargo clippy (debug)
+        if: matrix.build_type == 'debug'
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        if: matrix.build_type == 'release'
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
+
+      - name: Check documentation generation
+        if: matrix.build_type == 'release'
+        run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
+        env:
+            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
+
+      # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
+      - name: Check formatting
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
+        run: cargo fmt --all -- --check
+
+      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
+      - name: Check rust dependencies
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
+        run: |
+          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
+          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
+
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
+        run: cargo deny check
+
  gather-rust-build-stats:
    needs: [ check-permissions, build-build-tools-image ]
    if: |
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -261,6 +261,15 @@ version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"

+[[package]]
+name = "atomic-polyfill"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289"
+dependencies = [
+ "critical-section",
+]
+
 [[package]]
 name = "atomic-take"
 version = "1.1.0"
@@ -1442,6 +1451,12 @@ dependencies = [
 "itertools",
 ]

+[[package]]
+name = "critical-section"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
+
 [[package]]
 name = "crossbeam-channel"
 version = "0.5.8"
@@ -2267,6 +2282,15 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "hash32"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606"
+dependencies = [
+ "byteorder",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
@@ -2315,6 +2339,18 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "heapless"
+version = "0.8.0"
+source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963be2de24804acf5e5001#644653bf3b831c6bb4963be2de24804acf5e5001"
+dependencies = [
+ "atomic-polyfill",
+ "hash32",
+ "rustc_version",
+ "spin 0.9.8",
+ "stable_deref_trait",
+]
+
 [[package]]
 name = "heck"
 version = "0.4.1"
@@ -2348,6 +2384,16 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"

+[[package]]
+name = "histogram"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b"
+dependencies = [
+ "serde",
+ "thiserror",
+]
+
 [[package]]
 name = "hmac"
 version = "0.12.1"
@@ -4612,7 +4658,6 @@ name = "remote_storage"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "async-stream",
 "async-trait",
 "aws-config",
 "aws-credential-types",
@@ -5655,6 +5700,9 @@ name = "spin"
 version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
+dependencies = [
+ "lock_api",
+]

 [[package]]
 name = "spki"
@@ -5676,6 +5724,12 @@ dependencies = [
 "der 0.7.8",
 ]

+[[package]]
+name = "stable_deref_trait"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
+
 [[package]]
 name = "static_assertions"
 version = "1.1.0"
@@ -5793,6 +5847,7 @@ dependencies = [
 "futures",
 "futures-util",
 "hex",
+ "histogram",
 "humantime",
 "itertools",
 "once_cell",
@@ -6772,6 +6827,7 @@ dependencies = [
 "criterion",
 "fail",
 "futures",
+ "heapless",
 "hex",
 "hex-literal",
 "humantime",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -204,6 +204,9 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
 postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

+## Other git libraries
+heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
+
 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
--- a/18
+++ b/18
@@ -93,14 +93,13 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
 # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
-RUN mkdir -p /data/.neon/ && \
-  echo "id=1234" > "/data/.neon/identity.toml" && \
-  echo "broker_endpoint='http://storage_broker:50051'\n" \
-       "pg_distrib_dir='/usr/local/'\n" \
-       "listen_pg_addr='0.0.0.0:6400'\n" \
-       "listen_http_addr='0.0.0.0:9898'\n" \
-  > /data/.neon/pageserver.toml && \
-  chown -R neon:neon /data/.neon
+RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
+    && /usr/local/bin/pageserver -D /data/.neon/ --init \
+       -c "id=1234" \
+       -c "broker_endpoint='http://storage_broker:50051'" \
+       -c "pg_distrib_dir='/usr/local/'" \
+       -c "listen_pg_addr='0.0.0.0:6400'" \
+       -c "listen_http_addr='0.0.0.0:9898'"

 # When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
 # that want a particular postgres version will select it explicitly: this is just a default.
@@ -111,6 +110,3 @@ VOLUME ["/data"]
 USER neon
 EXPOSE 6400
 EXPOSE 9898
-
-CMD /usr/local/bin/pageserver -D /data/.neon
-
--- a/13
+++ b/13
@@ -69,8 +69,6 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
 # Set PQ_LIB_DIR to make sure `storage_controller` gets linked with bundled libpq (through diesel)
 CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib

-CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
-
 #
 # Top level Makefile to build Neon and PostgreSQL
 #
@@ -81,24 +79,15 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers walproposer-lib cargo-target-dir
+neon: postgres-headers walproposer-lib
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
-.PHONY: cargo-target-dir
-cargo-target-dir:
-	# https://github.com/rust-lang/cargo/issues/14281
-	mkdir -p target
-	test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG

 ### PostgreSQL parts
 # Some rules are duplicated for Postgres v14 and 15. We may want to refactor
 # to avoid the duplication in the future, but it's tolerable for now.
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
-
-	mkdir -p $(POSTGRES_INSTALL_DIR)
-	test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG
-
 	+@echo "Configuring Postgres $* build"
 	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
 		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -21,9 +21,7 @@ use pageserver_api::config::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
-use pageserver_api::controller_api::{
-    NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest,
-};
+use pageserver_api::controller_api::{PlacementPolicy, TenantCreateRequest};
 use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
 use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
 use postgres_backend::AuthType;
@@ -1252,70 +1250,9 @@ async fn handle_start_all(
            exit(1);
        }
    }
-
-    neon_start_status_check(env, retry_timeout).await?;
-
    Ok(())
 }

-async fn neon_start_status_check(
-    env: &local_env::LocalEnv,
-    retry_timeout: &Duration,
-) -> anyhow::Result<()> {
-    const RETRY_INTERVAL: Duration = Duration::from_millis(100);
-    const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5);
-
-    if env.control_plane_api.is_none() {
-        return Ok(());
-    }
-
-    let storcon = StorageController::from_env(env);
-
-    let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
-    let notice_after_retries = retry_timeout.as_millis() / NOTICE_AFTER_RETRIES.as_millis();
-
-    println!("\nRunning neon status check");
-
-    for retry in 0..retries {
-        if retry == notice_after_retries {
-            println!("\nNeon status check has not passed yet, continuing to wait")
-        }
-
-        let mut passed = true;
-        let mut nodes = storcon.node_list().await?;
-        let mut pageservers = env.pageservers.clone();
-
-        if nodes.len() != pageservers.len() {
-            continue;
-        }
-
-        nodes.sort_by_key(|ps| ps.id);
-        pageservers.sort_by_key(|ps| ps.id);
-
-        for (idx, pageserver) in pageservers.iter().enumerate() {
-            let node = &nodes[idx];
-            if node.id != pageserver.id {
-                passed = false;
-                break;
-            }
-
-            if !matches!(node.availability, NodeAvailabilityWrapper::Active) {
-                passed = false;
-                break;
-            }
-        }
-
-        if passed {
-            println!("\nNeon started and passed status check");
-            return Ok(());
-        }
-
-        tokio::time::sleep(RETRY_INTERVAL).await;
-    }
-
-    anyhow::bail!("\nNeon passed status check")
-}
-
 async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let immediate =
        sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -151,10 +151,7 @@ pub struct NeonBroker {
 pub struct NeonStorageControllerConf {
    /// Heartbeat timeout before marking a node offline
    #[serde(with = "humantime_serde")]
-    pub max_offline: Duration,
-
-    #[serde(with = "humantime_serde")]
-    pub max_warming_up: Duration,
+    pub max_unavailable: Duration,

    /// Threshold for auto-splitting a tenant into shards
    pub split_threshold: Option<u64>,
@@ -162,16 +159,14 @@ pub struct NeonStorageControllerConf {

 impl NeonStorageControllerConf {
    // Use a shorter pageserver unavailability interval than the default to speed up tests.
-    const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
-
-    const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
+    const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
+        std::time::Duration::from_secs(10);
 }

 impl Default for NeonStorageControllerConf {
    fn default() -> Self {
        Self {
-            max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
-            max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
+            max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
            split_threshold: None,
        }
    }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -25,7 +25,6 @@ use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use utils::auth::{Claims, Scope};
-use utils::id::NodeId;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -75,10 +74,6 @@ impl PageServerNode {
        }
    }

-    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
-        toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
-    }
-
    fn pageserver_init_make_toml(
        &self,
        conf: NeonLocalInitPageserverConf,
@@ -191,19 +186,6 @@ impl PageServerNode {
            .write_all(config.to_string().as_bytes())
            .context("write pageserver toml")?;
        drop(config_file);
-
-        let identity_file_path = datadir.join("identity.toml");
-        let mut identity_file = std::fs::OpenOptions::new()
-            .create_new(true)
-            .write(true)
-            .open(identity_file_path)
-            .with_context(|| format!("open identity toml for write: {config_file_path:?}"))?;
-        let identity_toml = self.pageserver_make_identity_toml(node_id);
-        identity_file
-            .write_all(identity_toml.to_string().as_bytes())
-            .context("write identity toml")?;
-        drop(identity_toml);
-
        // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config

        // Write metadata file, used by pageserver on startup to register itself with
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -5,9 +5,8 @@ use crate::{
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::{
    controller_api::{
-        NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
-        TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
-        TenantShardMigrateResponse,
+        NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse,
+        TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
    },
    models::{
        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
@@ -354,10 +353,8 @@ impl StorageController {
            "--dev",
            "--database-url",
            &database_url,
-            "--max-offline-interval",
-            &humantime::Duration::from(self.config.max_offline).to_string(),
-            "--max-warming-up-interval",
-            &humantime::Duration::from(self.config.max_warming_up).to_string(),
+            "--max-unavailable-interval",
+            &humantime::Duration::from(self.config.max_unavailable).to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
@@ -628,15 +625,6 @@ impl StorageController {
        .await
    }

-    pub async fn node_list(&self) -> anyhow::Result<Vec<NodeDescribeResponse>> {
-        self.dispatch::<(), Vec<NodeDescribeResponse>>(
-            Method::GET,
-            "control/v1/node".to_string(),
-            None,
-        )
-        .await
-    }
-
    #[instrument(skip(self))]
    pub async fn ready(&self) -> anyhow::Result<()> {
        self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -33,7 +33,7 @@ echo $result | jq .

 generate_id timeline_id
 PARAMS=(
-     -sbf
+     -sb 
     -X POST
     -H "Content-Type: application/json"
     -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -31,14 +31,25 @@ services:
    restart: always
    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
+      - BROKER_ENDPOINT='http://storage_broker:50051'
      - AWS_ACCESS_KEY_ID=minio
      - AWS_SECRET_ACCESS_KEY=password
      #- RUST_BACKTRACE=1
    ports:
       #- 6400:6400  # pg protocol handler
       - 9898:9898 # http endpoints
-    volumes:
-      - ./pageserver_config:/data/.neon/
+    entrypoint:
+      - "/bin/sh"
+      - "-c"
+    command:
+      - "/usr/local/bin/pageserver -D /data/.neon/
+                                   -c \"broker_endpoint=$$BROKER_ENDPOINT\"
+                                   -c \"listen_pg_addr='0.0.0.0:6400'\"
+                                   -c \"listen_http_addr='0.0.0.0:9898'\"
+                                   -c \"remote_storage={endpoint='http://minio:9000',
+                                                        bucket_name='neon',
+                                                        bucket_region='eu-north-1',
+                                                        prefix_in_bucket='/pageserver/'}\""
    depends_on:
      - storage_broker
      - minio_create_buckets
--- a/docker-compose/pageserver_config/identity.toml
+++ b/docker-compose/pageserver_config/identity.toml
@@ -1 +0,0 @@
-id=1234
--- a/docker-compose/pageserver_config/pageserver.toml
+++ b/docker-compose/pageserver_config/pageserver.toml
@@ -1,5 +0,0 @@
-broker_endpoint='http://storage_broker:50051'
-pg_distrib_dir='/usr/local/'
-listen_pg_addr='0.0.0.0:6400'
-listen_http_addr='0.0.0.0:9898'
-remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
--- a/docs/rfcs/034-timeline-archive.md
+++ b/docs/rfcs/034-timeline-archive.md
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,5 +1,4 @@
 use std::str::FromStr;
-use std::time::Instant;

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
@@ -151,16 +150,11 @@ impl UtilizationScore {
    }
 }

-#[derive(Serialize, Clone, Copy, Debug)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
    Active(UtilizationScore),
-    // Node is warming up, but we expect it to become available soon. Covers
-    // the time span between the re-attach response being composed on the storage controller
-    // and the first successful heartbeat after the processing of the re-attach response
-    // finishes on the pageserver.
-    WarmingUp(Instant),
    // Offline: Tenants shouldn't try to attach here, but they may assume that their
    // secondary locations on this node still exist.  Newly added nodes are in this
    // state until we successfully contact them.
@@ -170,10 +164,7 @@ pub enum NodeAvailability {
 impl PartialEq for NodeAvailability {
    fn eq(&self, other: &Self) -> bool {
        use NodeAvailability::*;
-        matches!(
-            (self, other),
-            (Active(_), Active(_)) | (Offline, Offline) | (WarmingUp(_), WarmingUp(_))
-        )
+        matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
    }
 }

@@ -185,7 +176,6 @@ impl Eq for NodeAvailability {}
 #[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 pub enum NodeAvailabilityWrapper {
    Active,
-    WarmingUp,
    Offline,
 }

@@ -195,7 +185,6 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
            // Assume the worst utilisation score to begin with. It will later be updated by
            // the heartbeats.
            NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
-            NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
            NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
        }
    }
@@ -205,7 +194,6 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
    fn from(val: NodeAvailability) -> Self {
        match val {
            NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
-            NodeAvailability::WarmingUp(_) => NodeAvailabilityWrapper::WarmingUp,
            NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
        }
    }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -5,6 +5,7 @@ pub mod utilization;
 pub use utilization::PageserverUtilization;

 use std::{
+    borrow::Cow,
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
@@ -19,6 +20,7 @@ use serde::{Deserialize, Serialize};
 use serde_with::serde_as;
 use utils::{
    completion,
+    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
    serde_system_time,
@@ -724,7 +726,58 @@ pub struct LayerMapInfo {
    pub historic_layers: Vec<HistoricLayerInfo>,
 }

-/// The residence status of a layer
+/// The kind of access that was made to a layer.
+///
+/// Used as the key of `LayerAccessStats::access_count_by_access_kind` and recorded for each
+/// individual access in `LayerAccessStatFullDetails::access_kind`.
+#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)]
+#[repr(usize)]
+pub enum LayerAccessKind {
+    GetValueReconstructData,
+    Iter,
+    KeyIter,
+    Dump,
+}
+
+/// Details of a single recorded layer access: when it happened, plus the task kind and the
+/// access kind associated with it.
+///
+/// Stored in `LayerAccessStats::first` and `LayerAccessStats::accesses_history`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LayerAccessStatFullDetails {
+    /// Time of the access, in milliseconds since the epoch (going by the field name).
+    pub when_millis_since_epoch: u64,
+    /// Task kind as a string.
+    /// NOTE(review): presumably the kind of the task that performed the access; confirm
+    /// against the code that fills this in.
+    pub task_kind: Cow<'static, str>,
+    /// Which kind of access this was.
+    pub access_kind: LayerAccessKind,
+}
+
+/// An event that impacts the layer's residence status.
+///
+/// Use [`LayerResidenceEvent::new`] to create one stamped with the current system time.
+#[serde_as]
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LayerResidenceEvent {
+    /// The time when the event occurred.
+    /// NB: this timestamp is captured while the residence status changes.
+    /// So, it might be behind/ahead of the actual residence change by a short amount of time.
+    /// It is serialized as `timestamp_millis_since_epoch` via `serde_with::TimestampMilliSeconds`.
+    #[serde(rename = "timestamp_millis_since_epoch")]
+    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
+    pub timestamp: SystemTime,
+    /// The new residence status of the layer.
+    pub status: LayerResidenceStatus,
+    /// The reason why we had to record this event.
+    pub reason: LayerResidenceEventReason,
+}
+
+/// The reason for recording a given [`LayerResidenceEvent`] (stored in its `reason` field).
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub enum LayerResidenceEventReason {
+    /// The layer map is being populated, e.g. during timeline load or attach.
+    /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
+    /// We need to record such events because there is no persistent storage for the events.
+    ///
+    // The two links below use relative HTML paths; see https://github.com/rust-lang/rust/issues/74481
+    /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
+    /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
+    LayerLoad,
+    /// We just created the layer (e.g., freeze_and_flush or compaction).
+    /// Such layers are always [`LayerResidenceStatus::Resident`].
+    LayerCreate,
+    /// We on-demand downloaded or evicted the given layer.
+    ResidenceChange,
+}
+
+/// The residence status of the layer, after the given [`LayerResidenceEvent`].
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub enum LayerResidenceStatus {
    /// Residence status for a layer file that exists locally.
@@ -734,16 +787,23 @@ pub enum LayerResidenceStatus {
    Evicted,
 }

-#[serde_as]
+impl LayerResidenceEvent {
+    /// Creates an event carrying the given `status` and `reason`.
+    ///
+    /// The `timestamp` is taken from [`SystemTime::now`] at the moment of the call.
+    pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self {
+        Self {
+            status,
+            reason,
+            timestamp: SystemTime::now(),
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerAccessStats {
-    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
-    pub access_time: SystemTime,
-
-    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
-    pub residence_time: SystemTime,
-
-    pub visible: bool,
+    pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
+    pub task_kind_access_flag: Vec<Cow<'static, str>>,
+    pub first: Option<LayerAccessStatFullDetails>,
+    pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
+    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -7,7 +7,6 @@ license.workspace = true
 [dependencies]
 anyhow.workspace = true
 async-trait.workspace = true
-async-stream.workspace = true
 once_cell.workspace = true
 aws-smithy-async.workspace = true
 aws-smithy-types.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -15,7 +15,7 @@ use std::time::SystemTime;
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
-use azure_core::{Continuable, RetryOptions};
+use azure_core::RetryOptions;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::blob::CopyStatus;
@@ -40,7 +40,6 @@ use crate::{

 pub struct AzureBlobStorage {
    client: ContainerClient,
-    container_name: String,
    prefix_in_container: Option<String>,
    max_keys_per_list_response: Option<NonZeroU32>,
    concurrency_limiter: ConcurrencyLimiter,
@@ -86,7 +85,6 @@ impl AzureBlobStorage {

        Ok(AzureBlobStorage {
            client,
-            container_name: azure_config.container_name.to_owned(),
            prefix_in_container: azure_config.prefix_in_container.to_owned(),
            max_keys_per_list_response,
            concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
@@ -240,10 +238,6 @@ impl AzureBlobStorage {
            _ = cancel.cancelled() => Err(Cancelled),
        }
    }
-
-    pub fn container_name(&self) -> &str {
-        &self.container_name
-    }
 }

 fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
@@ -267,30 +261,30 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
 }

 impl RemoteStorage for AzureBlobStorage {
-    fn list_streaming(
+    async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
-        // get the passed prefix or if it is not set use prefix_in_bucket value
-        let list_prefix = prefix
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                }
-                p
-            });
+    ) -> anyhow::Result<Listing, DownloadError> {
+        let _permit = self.permit(RequestKind::List, cancel).await?;

-        async_stream::stream! {
-            let _permit = self.permit(RequestKind::List, cancel).await?;
+        let op = async {
+            // get the passed prefix or if it is not set use prefix_in_bucket value
+            let list_prefix = prefix
+                .map(|p| self.relative_path_to_name(p))
+                .or_else(|| self.prefix_in_container.clone())
+                .map(|mut p| {
+                    // required to end with a separator
+                    // otherwise request will return only the entry of a prefix
+                    if matches!(mode, ListingMode::WithDelimiter)
+                        && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                    {
+                        p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                    }
+                    p
+                });

            let mut builder = self.client.list_blobs();

@@ -306,43 +300,21 @@ impl RemoteStorage for AzureBlobStorage {
                builder = builder.max_results(MaxResults::new(limit));
            }

-            let mut next_marker = None;
+            let response = builder.into_stream();
+            let response = response.into_stream().map_err(to_download_error);
+            let response = tokio_stream::StreamExt::timeout(response, self.timeout);
+            let response = response.map(|res| match res {
+                Ok(res) => res,
+                Err(_elapsed) => Err(DownloadError::Timeout),
+            });

-            'outer: loop {
-                let mut builder = builder.clone();
-                if let Some(marker) = next_marker.clone() {
-                    builder = builder.marker(marker);
-                }
-                let response = builder.into_stream();
-                let response = response.into_stream().map_err(to_download_error);
-                let response = tokio_stream::StreamExt::timeout(response, self.timeout);
-                let response = response.map(|res| match res {
-                    Ok(res) => res,
-                    Err(_elapsed) => Err(DownloadError::Timeout),
-                });
+            let mut response = std::pin::pin!(response);

-                let mut response = std::pin::pin!(response);
+            let mut res = Listing::default();

-                let mut max_keys = max_keys.map(|mk| mk.get());
-                let next_item = tokio::select! {
-                    op = response.next() => Ok(op),
-                    _ = cancel.cancelled() => Err(DownloadError::Cancelled),
-                }?;
-                let Some(entry) = next_item else {
-                    // The list is complete, so yield it.
-                    break;
-                };
-
-                let mut res = Listing::default();
-                let entry = match entry {
-                    Ok(entry) => entry,
-                    Err(e) => {
-                        // The error is potentially retryable, so we must rewind the loop after yielding.
-                        yield Err(e);
-                        continue;
-                    }
-                };
-                next_marker = entry.continuation();
+            let mut max_keys = max_keys.map(|mk| mk.get());
+            while let Some(entry) = response.next().await {
+                let entry = entry?;
                let prefix_iter = entry
                    .blobs
                    .prefixes()
@@ -361,19 +333,19 @@ impl RemoteStorage for AzureBlobStorage {
                        assert!(mk > 0);
                        mk -= 1;
                        if mk == 0 {
-                            yield Ok(res); // limit reached
-                            break 'outer;
+                            return Ok(res); // limit reached
                        }
                        max_keys = Some(mk);
                    }
                }
-                yield Ok(res);
-
-                // We are done here
-                if next_marker.is_none() {
-                    break;
-                }
            }
+
+            Ok(res)
+        };
+
+        tokio::select! {
+            res = op => res,
+            _ = cancel.cancelled() => Err(DownloadError::Cancelled),
        }
    }

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -26,7 +26,7 @@ use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};

 use bytes::Bytes;
-use futures::{stream::Stream, StreamExt};
+use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -160,18 +160,13 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`].
-    ///
-    /// The stream is guaranteed to return at least one element, even in the case of errors
-    /// (in that case it's an `Err()`), or an empty `Listing`.
-    ///
-    /// The stream is not ending if it returns an error, as long as [`is_permanent`] returns false on the error.
-    /// The `next` function can be retried, and maybe in a future retry, there will be success.
+    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
+    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
    ///
    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
    /// from the absolute root of the bucket.
    ///
-    /// `mode` configures whether to use a delimiter.  Without a delimiter, all keys
+    /// `mode` configures whether to use a delimiter.  Without a delimiter all keys
    /// within the prefix are listed in the `keys` of the result.  With a delimiter, any "directories" at the top level of
    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
    /// returned in `keys`.
@@ -180,32 +175,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// will iteratively call listobjects until it runs out of keys.  Note that this is not safe to use on
    /// unlimited size buckets, as the full list of objects is allocated into a monolithic data structure.
    ///
-    /// [`ListObjectsV2`]: <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>
-    /// [`is_permanent`]: DownloadError::is_permanent
-    fn list_streaming(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>>;
-
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
-        mode: ListingMode,
+        _mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> Result<Listing, DownloadError> {
-        let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel));
-        let mut combined = stream.next().await.expect("At least one item required")?;
-        while let Some(list) = stream.next().await {
-            let list = list?;
-            combined.keys.extend_from_slice(&list.keys);
-            combined.prefixes.extend_from_slice(&list.prefixes);
-        }
-        Ok(combined)
-    }
+    ) -> Result<Listing, DownloadError>;

    /// Streams the local file contents into the remote storage entry.
    ///
@@ -312,8 +288,8 @@ impl Debug for Download {

 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
-// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
 #[derive(Clone)]
+// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
 pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
@@ -322,14 +298,13 @@ pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
 }

 impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
-    // See [`RemoteStorage::list`].
    pub async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> Result<Listing, DownloadError> {
+    ) -> anyhow::Result<Listing, DownloadError> {
        match self {
            Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
            Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
@@ -338,23 +313,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

-    // See [`RemoteStorage::list_streaming`].
-    pub fn list_streaming<'a>(
-        &'a self,
-        prefix: Option<&'a RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-        cancel: &'a CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
-        match self {
-            Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
-                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
-            Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
-            Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
-            Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
-        }
-    }
-
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -546,16 +504,6 @@ impl GenericRemoteStorage {
            None => self.download(from, cancel).await,
        }
    }
-
-    /// The name of the bucket/container/etc.
-    pub fn bucket_name(&self) -> Option<&str> {
-        match self {
-            Self::LocalFs(_s) => None,
-            Self::AwsS3(s) => Some(s.bucket_name()),
-            Self::AzureBlob(s) => Some(s.container_name()),
-            Self::Unreliable(_s) => None,
-        }
-    }
 }

 /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -331,17 +331,6 @@ impl LocalFs {
 }

 impl RemoteStorage for LocalFs {
-    fn list_streaming(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
-        let listing = self.list(prefix, mode, max_keys, cancel);
-        futures::stream::once(listing)
-    }
-
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -386,10 +386,6 @@ impl S3Bucket {
        }
        Ok(())
    }
-
-    pub fn bucket_name(&self) -> &str {
-        &self.bucket_name
-    }
 }

 pin_project_lite::pin_project! {
@@ -467,16 +463,17 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
 }

 impl RemoteStorage for S3Bucket {
-    fn list_streaming(
+    async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+    ) -> Result<Listing, DownloadError> {
        let kind = RequestKind::List;
        // s3 sdk wants i32
        let mut max_keys = max_keys.map(|mk| mk.get() as i32);
+        let mut result = Listing::default();

        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
@@ -488,99 +485,89 @@ impl RemoteStorage for S3Bucket {
                })
            });

-        async_stream::stream! {
-            let _permit = self.permit(kind, cancel).await?;
+        let _permit = self.permit(kind, cancel).await?;

-            let mut continuation_token = None;
-            'outer: loop {
-                let started_at = start_measuring_requests(kind);
+        let mut continuation_token = None;

-                // min of two Options, returning Some if one is value and another is
-                // None (None is smaller than anything, so plain min doesn't work).
-                let request_max_keys = self
-                    .max_keys_per_list_response
-                    .into_iter()
-                    .chain(max_keys.into_iter())
-                    .min();
-                let mut request = self
-                    .client
-                    .list_objects_v2()
-                    .bucket(self.bucket_name.clone())
-                    .set_prefix(list_prefix.clone())
-                    .set_continuation_token(continuation_token.clone())
-                    .set_max_keys(request_max_keys);
+        loop {
+            let started_at = start_measuring_requests(kind);

-                if let ListingMode::WithDelimiter = mode {
-                    request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-                }
+            // min of two Options, returning Some if one is value and another is
+            // None (None is smaller than anything, so plain min doesn't work).
+            let request_max_keys = self
+                .max_keys_per_list_response
+                .into_iter()
+                .chain(max_keys.into_iter())
+                .min();
+            let mut request = self
+                .client
+                .list_objects_v2()
+                .bucket(self.bucket_name.clone())
+                .set_prefix(list_prefix.clone())
+                .set_continuation_token(continuation_token)
+                .set_max_keys(request_max_keys);

-                let request = request.send();
-
-                let response = tokio::select! {
-                    res = request => Ok(res),
-                    _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout),
-                    _ = cancel.cancelled() => Err(DownloadError::Cancelled),
-                }?;
-
-                let response = response
-                    .context("Failed to list S3 prefixes")
-                    .map_err(DownloadError::Other);
-
-                let started_at = ScopeGuard::into_inner(started_at);
-
-                crate::metrics::BUCKET_METRICS
-                    .req_seconds
-                    .observe_elapsed(kind, &response, started_at);
-
-                let response = match response {
-                    Ok(response) => response,
-                    Err(e) => {
-                        // The error is potentially retryable, so we must rewind the loop after yielding.
-                        yield Err(e);
-                        continue 'outer;
-                    },
-                };
-
-                let keys = response.contents();
-                let prefixes = response.common_prefixes.as_deref().unwrap_or_default();
-
-                tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
-                let mut result = Listing::default();
-
-                for object in keys {
-                    let object_path = object.key().expect("response does not contain a key");
-                    let remote_path = self.s3_object_to_relative_path(object_path);
-                    result.keys.push(remote_path);
-                    if let Some(mut mk) = max_keys {
-                        assert!(mk > 0);
-                        mk -= 1;
-                        if mk == 0 {
-                            // limit reached
-                            yield Ok(result);
-                            break 'outer;
-                        }
-                        max_keys = Some(mk);
-                    }
-                }
-
-                // S3 gives us prefixes like "foo/", we return them like "foo"
-                result.prefixes.extend(prefixes.iter().filter_map(|o| {
-                    Some(
-                        self.s3_object_to_relative_path(
-                            o.prefix()?
-                                .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
-                        ),
-                    )
-                }));
-
-                yield Ok(result);
-
-                continuation_token = match response.next_continuation_token {
-                    Some(new_token) => Some(new_token),
-                    None => break,
-                };
+            if let ListingMode::WithDelimiter = mode {
+                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
            }
+
+            let request = request.send();
+
+            let response = tokio::select! {
+                res = request => res,
+                _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
+                _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
+            };
+
+            let response = response
+                .context("Failed to list S3 prefixes")
+                .map_err(DownloadError::Other);
+
+            let started_at = ScopeGuard::into_inner(started_at);
+
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &response, started_at);
+
+            let response = response?;
+
+            let keys = response.contents();
+            let empty = Vec::new();
+            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
+
+            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+
+            for object in keys {
+                let object_path = object.key().expect("response does not contain a key");
+                let remote_path = self.s3_object_to_relative_path(object_path);
+                result.keys.push(remote_path);
+                if let Some(mut mk) = max_keys {
+                    assert!(mk > 0);
+                    mk -= 1;
+                    if mk == 0 {
+                        return Ok(result); // limit reached
+                    }
+                    max_keys = Some(mk);
+                }
+            }
+
+            // S3 gives us prefixes like "foo/", we return them like "foo"
+            result.prefixes.extend(prefixes.iter().filter_map(|o| {
+                Some(
+                    self.s3_object_to_relative_path(
+                        o.prefix()?
+                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
+                    ),
+                )
+            }));
+
+            continuation_token = match response.next_continuation_token {
+                Some(new_token) => Some(new_token),
+                None => break,
+            };
        }
+
+        Ok(result)
    }

    async fn upload(
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -3,7 +3,6 @@
 //! testing purposes.
 use bytes::Bytes;
 use futures::stream::Stream;
-use futures::StreamExt;
 use std::collections::HashMap;
 use std::num::NonZeroU32;
 use std::sync::Mutex;
@@ -108,23 +107,6 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;

 impl RemoteStorage for UnreliableWrapper {
-    fn list_streaming(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
-        async_stream::stream! {
-            self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
-                .map_err(DownloadError::Other)?;
-            let mut stream = self.inner
-                .list_streaming(prefix, mode, max_keys, cancel);
-            while let Some(item) = stream.next().await {
-                yield item;
-            }
-        }
-    }
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/tests/common/mod.rs
+++ b/libs/remote_storage/tests/common/mod.rs
@@ -152,7 +152,7 @@ pub(crate) async fn upload_remote_data(
    let mut upload_tasks = JoinSet::new();
    let cancel = CancellationToken::new();

-    for i in 1..=upload_tasks_count {
+    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        let cancel = cancel.clone();

--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,6 +1,5 @@
 use anyhow::Context;
 use camino::Utf8Path;
-use futures::StreamExt;
 use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
@@ -30,10 +29,10 @@ use super::{
 /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
 /// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
 ///
-/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_in_list_response` param to limit the keys in a single response.
-/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
-/// as the current default AWS S3 pagination limit is 1000.
-/// (see <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>).
+/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
+/// since the current default AWS S3 pagination limit is 1000.
+/// (see <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>)
 ///
 /// Lastly, the test attempts to clean up and remove all uploaded S3 files.
 /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
@@ -88,41 +87,6 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
    );

-    // list_streaming
-
-    let prefix_with_slash = base_prefix.add_trailing_slash();
-    let mut nested_remote_prefixes_st = test_client.list_streaming(
-        Some(&prefix_with_slash),
-        ListingMode::WithDelimiter,
-        None,
-        &cancel,
-    );
-    let mut nested_remote_prefixes_combined = HashSet::new();
-    let mut segments = 0;
-    let mut segment_max_size = 0;
-    while let Some(st) = nested_remote_prefixes_st.next().await {
-        let st = st?;
-        segment_max_size = segment_max_size.max(st.prefixes.len());
-        nested_remote_prefixes_combined.extend(st.prefixes.into_iter());
-        segments += 1;
-    }
-    assert!(segments > 1, "less than 2 segments: {segments}");
-    assert!(
-        segment_max_size * 2 <= nested_remote_prefixes_combined.len(),
-        "double of segment_max_size={segment_max_size} larger number of remote prefixes of {}",
-        nested_remote_prefixes_combined.len()
-    );
-    let remote_only_prefixes = nested_remote_prefixes_combined
-        .difference(&expected_remote_prefixes)
-        .collect::<HashSet<_>>();
-    let missing_uploaded_prefixes = expected_remote_prefixes
-        .difference(&nested_remote_prefixes_combined)
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
-        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
-    );
-
    Ok(())
 }

--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -20,6 +20,7 @@ bincode.workspace = true
 bytes.workspace = true
 camino.workspace = true
 chrono.workspace = true
+heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 humantime.workspace = true
 hyper = { workspace = true, features = ["full"] }
--- a/libs/utils/src/history_buffer.rs
+++ b/libs/utils/src/history_buffer.rs
@@ -0,0 +1,196 @@
+//! A heapless buffer for events of sorts.
+
+use std::ops;
+
+use heapless::HistoryBuffer;
+
+/// A [`HistoryBuffer`] holding at most `L` elements, paired with a counter of
+/// the writes that did not grow the buffer (see `write`).
+///
+/// Read access to the underlying buffer is provided through `Deref`.
+#[derive(Debug, Clone)]
+pub struct HistoryBufferWithDropCounter<T, const L: usize> {
+    /// The wrapped `heapless` buffer that stores the elements.
+    buffer: HistoryBuffer<T, L>,
+    /// Number of `write` calls after which the buffer length was unchanged.
+    drop_count: u64,
+}
+
+impl<T, const L: usize> HistoryBufferWithDropCounter<T, L> {
+    /// Writes `data` into the buffer and bumps the drop counter if the write
+    /// did not make the buffer any longer.
+    pub fn write(&mut self, data: T) {
+        let occupied_before = self.buffer.len();
+        self.buffer.write(data);
+        // An unchanged length means the new element took the place of an existing one.
+        if self.buffer.len() == occupied_before {
+            self.drop_count += 1;
+        }
+    }
+
+    /// Returns how many writes so far left the buffer length unchanged.
+    pub fn drop_count(&self) -> u64 {
+        self.drop_count
+    }
+
+    /// Builds a new buffer of the same capacity by applying `f` to every element,
+    /// visiting them from oldest to newest, and carries the drop counter over as is.
+    pub fn map<U, F: Fn(&T) -> U>(&self, f: F) -> HistoryBufferWithDropCounter<U, L> {
+        let mut mapped: HistoryBuffer<U, L> = HistoryBuffer::new();
+        for element in self.buffer.oldest_ordered() {
+            mapped.write(f(element));
+        }
+        HistoryBufferWithDropCounter {
+            buffer: mapped,
+            drop_count: self.drop_count,
+        }
+    }
+}
+
+impl<T, const L: usize> Default for HistoryBufferWithDropCounter<T, L> {
+    /// Creates a wrapper around a default-constructed buffer with a drop count of zero.
+    fn default() -> Self {
+        let buffer = HistoryBuffer::default();
+        let drop_count = 0;
+        Self { buffer, drop_count }
+    }
+}
+
+impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
+    type Target = HistoryBuffer<T, L>;
+
+    /// Gives shared access to the wrapped buffer, so its read-only methods can be
+    /// called directly on the wrapper.
+    fn deref(&self) -> &Self::Target {
+        let Self { buffer, .. } = self;
+        buffer
+    }
+}
+
+/// Serialized form of [`HistoryBufferWithDropCounter`]: the `Serialize` and
+/// `Deserialize` impls in this file convert through this struct.
+#[derive(serde::Serialize, serde::Deserialize)]
+struct SerdeRepr<T> {
+    /// Clones of the elements held by the buffer at serialization time.
+    buffer: Vec<T>,
+    /// The capacity `L` of the serialized buffer; checked against `L` on deserialization.
+    buffer_size: usize,
+    /// The drop counter, carried over unchanged.
+    drop_count: u64,
+}
+
+impl<'a, T, const L: usize> From<&'a HistoryBufferWithDropCounter<T, L>> for SerdeRepr<T>
+where
+    T: Clone + serde::Serialize,
+{
+    /// Snapshots `value` into its serializable form by cloning every stored element
+    /// and recording the capacity `L` and the current drop count.
+    ///
+    /// NOTE(review): the elements are taken via `iter()`, which looks like the
+    /// buffer's backing-storage order rather than oldest-to-newest (`map` uses
+    /// `oldest_ordered()` instead) -- confirm that this order is intended for a
+    /// buffer that has wrapped around.
+    fn from(value: &'a HistoryBufferWithDropCounter<T, L>) -> Self {
+        let elements: Vec<T> = value.buffer.iter().cloned().collect();
+        SerdeRepr {
+            buffer: elements,
+            buffer_size: L,
+            drop_count: value.drop_count,
+        }
+    }
+}
+
+impl<T, const L: usize> serde::Serialize for HistoryBufferWithDropCounter<T, L>
+where
+    T: Clone + serde::Serialize,
+{
+    /// Serializes the buffer by first converting it into a [`SerdeRepr`] and then
+    /// delegating to that type's derived serializer.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let repr = SerdeRepr::<T>::from(self);
+        serde::Serialize::serialize(&repr, serializer)
+    }
+}
+
+impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
+where
+    T: Clone + serde::Deserialize<'de>,
+{
+    /// Rebuilds the buffer from its [`SerdeRepr`] form.
+    ///
+    /// Returns a custom deserialization error if the recorded `buffer_size` differs
+    /// from `L`, or if the payload carries more than `L` elements.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        use serde::de::Error;
+
+        let SerdeRepr {
+            buffer: des_buffer,
+            drop_count,
+            buffer_size,
+        } = SerdeRepr::<T>::deserialize(deserializer)?;
+        if buffer_size != L {
+            return Err(D::Error::custom(format!(
+                "invalid buffer_size, expecting {L} got {buffer_size}"
+            )));
+        }
+        // A buffer of capacity `L` never serializes more than `L` elements. Feeding a
+        // longer list into `extend` would silently discard the surplus without it
+        // being reflected in `drop_count`, so reject such input instead.
+        if des_buffer.len() > L {
+            return Err(D::Error::custom(format!(
+                "invalid buffer length, expecting at most {L} got {}",
+                des_buffer.len()
+            )));
+        }
+        let mut buffer = HistoryBuffer::new();
+        buffer.extend(des_buffer);
+        Ok(HistoryBufferWithDropCounter { buffer, drop_count })
+    }
+}
+
+// Unit tests for `HistoryBufferWithDropCounter`, all using a capacity of 2.
+#[cfg(test)]
+mod test {
+    use super::HistoryBufferWithDropCounter;
+
+    // Three writes into a capacity-2 buffer keep the two most recent values, and a
+    // JSON round trip yields the same `iter()` sequence.
+    #[test]
+    fn test_basics() {
+        let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
+        b.write(1);
+        b.write(2);
+        b.write(3);
+        assert!(b.iter().any(|e| *e == 2));
+        assert!(b.iter().any(|e| *e == 3));
+        assert!(!b.iter().any(|e| *e == 1));
+
+        // round-trip serde
+        // NOTE(review): both sides are compared via `iter()`, not `oldest_ordered()`;
+        // confirm whether oldest-to-newest order is also expected to survive.
+        let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
+            serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
+        assert_eq!(
+            round_tripped.iter().cloned().collect::<Vec<_>>(),
+            b.iter().cloned().collect::<Vec<_>>()
+        );
+    }
+
+    // The drop counter stays at 0 for the first two writes and then goes up by one
+    // for each further write.
+    #[test]
+    fn test_drop_count_works() {
+        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
+        b.write(1);
+        assert_eq!(b.drop_count(), 0);
+        b.write(2);
+        assert_eq!(b.drop_count(), 0);
+        b.write(3);
+        assert_eq!(b.drop_count(), 1);
+        b.write(4);
+        assert_eq!(b.drop_count(), 2);
+    }
+
+    // A clone carries over contents and drop count, and writing to the clone does
+    // not affect the original.
+    #[test]
+    fn test_clone_works() {
+        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
+        b.write(1);
+        b.write(2);
+        b.write(3);
+        assert_eq!(b.drop_count(), 1);
+        let mut c = b.clone();
+        assert_eq!(c.drop_count(), 1);
+        assert!(c.iter().any(|e| *e == 2));
+        assert!(c.iter().any(|e| *e == 3));
+        assert!(!c.iter().any(|e| *e == 1));
+
+        c.write(4);
+        assert!(c.iter().any(|e| *e == 4));
+        assert!(!b.iter().any(|e| *e == 4));
+    }
+
+    // `map` preserves oldest-to-newest order and the drop count, checked with the
+    // buffer partially filled, exactly full, and after one overwrite.
+    #[test]
+    fn test_map() {
+        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
+
+        b.write(1);
+        assert_eq!(b.drop_count(), 0);
+        {
+            let c = b.map(|i| i + 10);
+            assert_eq!(c.oldest_ordered().cloned().collect::<Vec<_>>(), vec![11]);
+            assert_eq!(c.drop_count(), 0);
+        }
+
+        b.write(2);
+        assert_eq!(b.drop_count(), 0);
+        {
+            let c = b.map(|i| i + 10);
+            assert_eq!(
+                c.oldest_ordered().cloned().collect::<Vec<_>>(),
+                vec![11, 12]
+            );
+            assert_eq!(c.drop_count(), 0);
+        }
+
+        b.write(3);
+        assert_eq!(b.drop_count(), 1);
+        {
+            let c = b.map(|i| i + 10);
+            assert_eq!(
+                c.oldest_ordered().cloned().collect::<Vec<_>>(),
+                vec![12, 13]
+            );
+            assert_eq!(c.drop_count(), 1);
+        }
+    }
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -59,6 +59,8 @@ pub mod signals;

 pub mod fs_ext;

+pub mod history_buffer;
+
 pub mod measured_stream;

 pub mod serde_percent;
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -49,7 +49,6 @@ pub struct TenantShardId {

 impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);
-    pub const MIN: Self = Self(0);

    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,35 +2,30 @@

 //! Main entry point for the Page Server executable.

-use std::env;
 use std::env::{var, VarError};
 use std::io::Read;
 use std::sync::Arc;
 use std::time::Duration;
+use std::{env, ops::ControlFlow, str::FromStr};

 use anyhow::{anyhow, Context};
 use camino::Utf8Path;
 use clap::{Arg, ArgAction, Command};

 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
-use pageserver::config::PageserverIdentity;
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
-use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
-};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
-use tokio_util::sync::CancellationToken;
 use tracing::*;

 use metrics::set_build_info_metric;
 use pageserver::{
-    config::PageServerConf,
+    config::{defaults::*, PageServerConf},
    context::{DownloadBehavior, RequestContext},
    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
@@ -89,13 +84,18 @@ fn main() -> anyhow::Result<()> {
        .with_context(|| format!("Error opening workdir '{workdir}'"))?;

    let cfg_file_path = workdir.join("pageserver.toml");
-    let identity_file_path = workdir.join("identity.toml");

    // Set CWD to workdir for non-daemon modes
    env::set_current_dir(&workdir)
        .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;

-    let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
+    let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
+        ControlFlow::Continue(conf) => conf,
+        ControlFlow::Break(()) => {
+            info!("Pageserver config init successful");
+            return Ok(());
+        }
+    };

    // Initialize logging.
    //
@@ -150,55 +150,70 @@ fn main() -> anyhow::Result<()> {
 }

 fn initialize_config(
-    identity_file_path: &Utf8Path,
    cfg_file_path: &Utf8Path,
+    arg_matches: clap::ArgMatches,
    workdir: &Utf8Path,
-) -> anyhow::Result<&'static PageServerConf> {
-    // The deployment orchestrator writes out an indentity file containing the node id
-    // for all pageservers. This file is the source of truth for the node id. In order
-    // to allow for rolling back pageserver releases, the node id is also included in
-    // the pageserver config that the deployment orchestrator writes to disk for the pageserver.
-    // A rolled back version of the pageserver will get the node id from the pageserver.toml
-    // config file.
-    let identity = match std::fs::File::open(identity_file_path) {
+) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
+    let init = arg_matches.get_flag("init");
+
+    let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
        Ok(mut f) => {
-            let md = f.metadata().context("stat config file")?;
-            if !md.is_file() {
-                anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ...");
+            if init {
+                anyhow::bail!("config file already exists: {cfg_file_path}");
            }
-
-            let mut s = String::new();
-            f.read_to_string(&mut s).context("read identity file")?;
-            toml_edit::de::from_str::<PageserverIdentity>(&s)?
-        }
-        Err(e) => {
-            anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ...");
-        }
-    };
-
-    let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
-        Ok(mut f) => {
            let md = f.metadata().context("stat config file")?;
            if md.is_file() {
                let mut s = String::new();
                f.read_to_string(&mut s).context("read config file")?;
-                s.parse().context("parse config file toml")?
+                Some(s.parse().context("parse config file toml")?)
            } else {
                anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
            }
        }
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
        Err(e) => {
            anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
        }
    };

-    debug!("Using pageserver toml: {config}");
+    let mut effective_config = file_contents.unwrap_or_else(|| {
+        DEFAULT_CONFIG_FILE
+            .parse()
+            .expect("unit tests ensure this works")
+    });
+
+    // Patch with overrides from the command line
+    if let Some(values) = arg_matches.get_many::<String>("config-override") {
+        for option_line in values {
+            let doc = toml_edit::Document::from_str(option_line).with_context(|| {
+                format!("Option '{option_line}' could not be parsed as a toml document")
+            })?;
+
+            for (key, item) in doc.iter() {
+                effective_config.insert(key, item.clone());
+            }
+        }
+    }
+
+    debug!("Resulting toml: {effective_config}");

    // Construct the runtime representation
-    let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
+    let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
        .context("Failed to parse pageserver configuration")?;

-    Ok(Box::leak(Box::new(conf)))
+    if init {
+        info!("Writing pageserver config to '{cfg_file_path}'");
+
+        std::fs::write(cfg_file_path, effective_config.to_string())
+            .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
+        info!("Config successfully written to '{cfg_file_path}'")
+    }
+
+    Ok(if init {
+        ControlFlow::Break(())
+    } else {
+        ControlFlow::Continue(Box::leak(Box::new(conf)))
+    })
 }

 struct WaitForPhaseResult<F: std::future::Future + Unpin> {
@@ -290,7 +305,6 @@ fn start_pageserver(
    // Create and lock PID file. This ensures that there cannot be more than one
    // pageserver process running at the same time.
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
-    info!("Claiming pid file at {lock_file_path:?}...");
    let lock_file =
        utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
    info!("Claimed pid file at {lock_file_path:?}");
@@ -416,10 +430,8 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    let background_purges = mgr::BackgroundPurges::default();
    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
-        background_purges.clone(),
        TenantSharedResources {
            broker_client: broker_client.clone(),
            remote_storage: remote_storage.clone(),
@@ -511,7 +523,7 @@ fn start_pageserver(
        }
    });

-    let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks(
+    let secondary_controller = secondary::spawn_tasks(
        tenant_manager.clone(),
        remote_storage.clone(),
        background_jobs_barrier.clone(),
@@ -524,19 +536,18 @@ fn start_pageserver(
    // been configured.
    let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();

-    let disk_usage_eviction_task = launch_disk_usage_global_eviction_task(
+    launch_disk_usage_global_eviction_task(
        conf,
        remote_storage.clone(),
        disk_usage_eviction_state.clone(),
        tenant_manager.clone(),
        background_jobs_barrier.clone(),
-    );
+    )?;

    // Start up the service to handle HTTP mgmt API request. We created the
    // listener earlier already.
-    let http_endpoint_listener = {
-        let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
-        let cancel = CancellationToken::new();
+    {
+        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();

        let router_state = Arc::new(
            http::routes::State::new(
@@ -557,44 +568,77 @@ fn start_pageserver(
        let service = utils::http::RouterService::new(router).unwrap();
        let server = hyper::Server::from_tcp(http_listener)?
            .serve(service)
-            .with_graceful_shutdown({
-                let cancel = cancel.clone();
-                async move { cancel.clone().cancelled().await }
-            });
+            .with_graceful_shutdown(task_mgr::shutdown_watcher());

-        let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+        task_mgr::spawn(
+            MGMT_REQUEST_RUNTIME.handle(),
+            TaskKind::HttpEndpointListener,
+            None,
+            None,
            "http endpoint listener",
-            server,
-        ));
-        HttpEndpointListener(CancellableTask { task, cancel })
-    };
+            true,
+            async {
+                server.await?;
+                Ok(())
+            },
+        );
+    }

-    let consumption_metrics_tasks = {
-        let cancel = shutdown_pageserver.child_token();
-        let task = crate::BACKGROUND_RUNTIME.spawn({
-            let tenant_manager = tenant_manager.clone();
-            let cancel = cancel.clone();
-            async move {
-                // first wait until background jobs are cleared to launch.
-                //
-                // this is because we only process active tenants and timelines, and the
-                // Timeline::get_current_logical_size will spawn the logical size calculation,
-                // which will not be rate-limited.
-                tokio::select! {
-                    _ = cancel.cancelled() => { return; },
-                    _ = background_jobs_barrier.wait() => {}
-                };
+    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+        let metrics_ctx = RequestContext::todo_child(
+            TaskKind::MetricsCollection,
+            // This task itself shouldn't download anything.
+            // The actual size calculation does need downloads, and
+            // creates a child context with the right DownloadBehavior.
+            DownloadBehavior::Error,
+        );

-                pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await;
-            }
-        });
-        ConsumptionMetricsTasks(CancellableTask { task, cancel })
-    };
+        let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
+
+        task_mgr::spawn(
+            crate::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MetricsCollection,
+            None,
+            None,
+            "consumption metrics collection",
+            true,
+            {
+                let tenant_manager = tenant_manager.clone();
+                async move {
+                    // first wait until background jobs are cleared to launch.
+                    //
+                    // this is because we only process active tenants and timelines, and the
+                    // Timeline::get_current_logical_size will spawn the logical size calculation,
+                    // which will not be rate-limited.
+                    let cancel = task_mgr::shutdown_token();
+
+                    tokio::select! {
+                        _ = cancel.cancelled() => { return Ok(()); },
+                        _ = background_jobs_barrier.wait() => {}
+                    };
+
+                    pageserver::consumption_metrics::collect_metrics(
+                        tenant_manager,
+                        metric_collection_endpoint,
+                        &conf.metric_collection_bucket,
+                        conf.metric_collection_interval,
+                        conf.synthetic_size_calculation_interval,
+                        conf.id,
+                        local_disk_storage,
+                        cancel,
+                        metrics_ctx,
+                    )
+                    .instrument(info_span!("metrics_collection"))
+                    .await?;
+                    Ok(())
+                }
+            },
+        );
+    }

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    let libpq_listener = {
-        let cancel = CancellationToken::new();
+    {
        let libpq_ctx = RequestContext::todo_child(
            TaskKind::LibpqEndpointListener,
            // listener task shouldn't need to download anything. (We will
@@ -603,20 +647,29 @@ fn start_pageserver(
            // accept connections.)
            DownloadBehavior::Error,
        );
-
-        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-            "libpq listener",
-            page_service::libpq_listener_main(
-                tenant_manager.clone(),
-                pg_auth,
-                pageserver_listener,
-                conf.pg_auth_type,
-                libpq_ctx,
-                cancel.clone(),
-            ),
-        ));
-        LibpqEndpointListener(CancellableTask { task, cancel })
-    };
+        task_mgr::spawn(
+            COMPUTE_REQUEST_RUNTIME.handle(),
+            TaskKind::LibpqEndpointListener,
+            None,
+            None,
+            "libpq endpoint listener",
+            true,
+            {
+                let tenant_manager = tenant_manager.clone();
+                async move {
+                    page_service::libpq_listener_main(
+                        tenant_manager,
+                        pg_auth,
+                        pageserver_listener,
+                        conf.pg_auth_type,
+                        libpq_ctx,
+                        task_mgr::shutdown_token(),
+                    )
+                    .await
+                }
+            },
+        );
+    }

    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

@@ -642,18 +695,7 @@ fn start_pageserver(
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
-            pageserver::shutdown_pageserver(
-                http_endpoint_listener,
-                libpq_listener,
-                consumption_metrics_tasks,
-                disk_usage_eviction_task,
-                &tenant_manager,
-                background_purges,
-                deletion_queue.clone(),
-                secondary_controller_tasks,
-                0,
-            )
-            .await;
+            pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
            unreachable!()
        })
    }
@@ -692,12 +734,28 @@ fn cli() -> Command {
    Command::new("Neon page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(version())
+        .arg(
+            Arg::new("init")
+                .long("init")
+                .action(ArgAction::SetTrue)
+                .help("Initialize pageserver with all given config overrides"),
+        )
        .arg(
            Arg::new("workdir")
                .short('D')
                .long("workdir")
                .help("Working directory for the pageserver"),
        )
+        // See `settings.md` for more details on the extra configuration parameters pageserver can process
+        .arg(
+            Arg::new("config-override")
+                .long("config-override")
+                .short('c')
+                .num_args(1)
+                .action(ArgAction::Append)
+                .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
+                Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
+        )
        .arg(
            Arg::new("enabled-features")
                .long("enabled-features")
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -7,8 +7,8 @@
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
 use remote_storage::{RemotePath, RemoteStorageConfig};
+use serde;
 use serde::de::IntoDeserializer;
-use serde::{self, Deserialize};
 use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
@@ -406,13 +406,6 @@ struct PageServerConfigBuilder {
 }

 impl PageServerConfigBuilder {
-    fn new(node_id: NodeId) -> Self {
-        let mut this = Self::default();
-        this.id(node_id);
-
-        this
-    }
-
    #[inline(always)]
    fn default_values() -> Self {
        use self::BuilderValue::*;
@@ -888,12 +881,8 @@ impl PageServerConf {
    /// validating the input and failing on errors.
    ///
    /// This leaves any options not present in the file in the built-in defaults.
-    pub fn parse_and_validate(
-        node_id: NodeId,
-        toml: &Document,
-        workdir: &Utf8Path,
-    ) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::new(node_id);
+    pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result<Self> {
+        let mut builder = PageServerConfigBuilder::default();
        builder.workdir(workdir.to_owned());

        let mut t_conf = TenantConfOpt::default();
@@ -924,8 +913,7 @@ impl PageServerConf {
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
-                "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
-                            // Logging is not set up yet, so we can't do it.
+                "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                "log_format" => builder.log_format(
@@ -1102,12 +1090,6 @@ impl PageServerConf {
    }
 }

-#[derive(Deserialize)]
-#[serde(deny_unknown_fields)]
-pub struct PageserverIdentity {
-    pub id: NodeId,
-}
-
 // Helper functions to parse a toml Item

 fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
@@ -1277,7 +1259,7 @@ background_task_maximum_delay = '334 s'
        );
        let toml = config_string.parse()?;

-        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
+        let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));

        assert_eq!(
@@ -1359,7 +1341,7 @@ background_task_maximum_delay = '334 s'
        );
        let toml = config_string.parse()?;

-        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
+        let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));

        assert_eq!(
@@ -1449,13 +1431,12 @@ broker_endpoint = '{broker_endpoint}'

            let toml = config_string.parse()?;

-            let parsed_remote_storage_config =
-                PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-                    .unwrap_or_else(|e| {
-                        panic!("Failed to parse config '{config_string}', reason: {e:?}")
-                    })
-                    .remote_storage_config
-                    .expect("Should have remote storage config for the local FS");
+            let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
+                .unwrap_or_else(|e| {
+                    panic!("Failed to parse config '{config_string}', reason: {e:?}")
+                })
+                .remote_storage_config
+                .expect("Should have remote storage config for the local FS");

            assert_eq!(
                parsed_remote_storage_config,
@@ -1511,13 +1492,12 @@ broker_endpoint = '{broker_endpoint}'

            let toml = config_string.parse()?;

-            let parsed_remote_storage_config =
-                PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-                    .unwrap_or_else(|e| {
-                        panic!("Failed to parse config '{config_string}', reason: {e:?}")
-                    })
-                    .remote_storage_config
-                    .expect("Should have remote storage config for S3");
+            let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
+                .unwrap_or_else(|e| {
+                    panic!("Failed to parse config '{config_string}', reason: {e:?}")
+                })
+                .remote_storage_config
+                .expect("Should have remote storage config for S3");

            assert_eq!(
                parsed_remote_storage_config,
@@ -1596,7 +1576,7 @@ threshold = "20m"
 "#,
        );
        let toml: Document = pageserver_conf_toml.parse()?;
-        let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?;
+        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;

        assert_eq!(conf.pg_distrib_dir, pg_distrib_dir);
        assert_eq!(
@@ -1612,11 +1592,7 @@ threshold = "20m"
                .evictions_low_residence_duration_metric_threshold,
            Duration::from_secs(20 * 60)
        );
-
-        // Assert that the node id provided by the indentity file (threaded
-        // through the call to [`PageServerConf::parse_and_validate`] is
-        // used.
-        assert_eq!(conf.id, NodeId(333));
+        assert_eq!(conf.id, NodeId(222));
        assert_eq!(
            conf.disk_usage_based_eviction,
            Some(DiskUsageEvictionTaskConfig {
@@ -1625,7 +1601,7 @@ threshold = "20m"
                period: Duration::from_secs(10),
                #[cfg(feature = "testing")]
                mock_statvfs: None,
-                eviction_order: Default::default(),
+                eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
            })
        );

@@ -1661,7 +1637,7 @@ threshold = "20m"
 "#,
        );
        let toml: Document = pageserver_conf_toml.parse().unwrap();
-        let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap();
+        let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap();

        match &conf.default_tenant_conf.eviction_policy {
            EvictionPolicy::OnlyImitiate(t) => {
@@ -1680,7 +1656,7 @@ threshold = "20m"
 remote_storage = {}
        "#;
        let doc = toml_edit::Document::from_str(input).unwrap();
-        let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir)
+        let err = PageServerConf::parse_and_validate(&doc, &workdir)
            .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
        assert!(format!("{err}").contains("remote_storage"), "{err}");
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -1,6 +1,5 @@
 //! Periodically collect consumption metrics for all active tenants
 //! and push them to a HTTP endpoint.
-use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::size::CalculateSyntheticSizeError;
@@ -40,74 +39,49 @@ type RawMetric = (MetricsKey, (EventType, u64));
 /// for deduplication, but that is no longer needed.
 type Cache = HashMap<MetricsKey, (EventType, u64)>;

-pub async fn run(
-    conf: &'static PageServerConf,
-    tenant_manager: Arc<TenantManager>,
-    cancel: CancellationToken,
-) {
-    let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else {
-        return;
-    };
-
-    let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
-
-    let metrics_ctx = RequestContext::todo_child(
-        TaskKind::MetricsCollection,
-        // This task itself shouldn't download anything.
-        // The actual size calculation does need downloads, and
-        // creates a child context with the right DownloadBehavior.
-        DownloadBehavior::Error,
-    );
-    let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-        "consumption metrics collection",
-        collect_metrics(
-            tenant_manager.clone(),
-            metric_collection_endpoint,
-            &conf.metric_collection_bucket,
-            conf.metric_collection_interval,
-            conf.id,
-            local_disk_storage,
-            cancel.clone(),
-            metrics_ctx,
-        )
-        .instrument(info_span!("metrics_collection")),
-    ));
-
-    let worker_ctx =
-        RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
-    let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-        "synthetic size calculation",
-        calculate_synthetic_size_worker(
-            tenant_manager.clone(),
-            conf.synthetic_size_calculation_interval,
-            cancel.clone(),
-            worker_ctx,
-        )
-        .instrument(info_span!("synthetic_size_worker")),
-    ));
-
-    let (collect_metrics, synthetic_size_worker) =
-        futures::future::join(collect_metrics, synthetic_size_worker).await;
-    collect_metrics
-        .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process");
-    synthetic_size_worker
-        .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process");
-}
-
 /// Main thread that serves metrics collection
 #[allow(clippy::too_many_arguments)]
-async fn collect_metrics(
+pub async fn collect_metrics(
    tenant_manager: Arc<TenantManager>,
    metric_collection_endpoint: &Url,
    metric_collection_bucket: &Option<RemoteStorageConfig>,
    metric_collection_interval: Duration,
+    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
    local_disk_storage: Utf8PathBuf,
    cancel: CancellationToken,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
+    // spin up background worker that calculates tenant sizes
+    let worker_ctx =
+        ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::CalculateSyntheticSize,
+        None,
+        None,
+        "synthetic size calculation",
+        false,
+        {
+            let tenant_manager = tenant_manager.clone();
+            async move {
+                calculate_synthetic_size_worker(
+                    tenant_manager,
+                    synthetic_size_calculation_interval,
+                    &cancel,
+                    &worker_ctx,
+                )
+                .instrument(info_span!("synthetic_size_worker"))
+                .await?;
+                Ok(())
+            }
+        },
+    );
+
    let path: Arc<Utf8PathBuf> = Arc::new(local_disk_storage);

+    let cancel = task_mgr::shutdown_token();
+
    let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);

    let mut cached_metrics = tokio::select! {
@@ -194,9 +168,11 @@ async fn collect_metrics(
            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
        );

-        let res =
-            tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled())
-                .await;
+        let res = tokio::time::timeout_at(
+            started_at + metric_collection_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
        if res.is_ok() {
            return Ok(());
        }
@@ -296,8 +272,8 @@ async fn reschedule(
 async fn calculate_synthetic_size_worker(
    tenant_manager: Arc<TenantManager>,
    synthetic_size_calculation_interval: Duration,
-    cancel: CancellationToken,
-    ctx: RequestContext,
+    cancel: &CancellationToken,
+    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
    scopeguard::defer! {
@@ -337,7 +313,7 @@ async fn calculate_synthetic_size_worker(
            // there is never any reason to exit calculate_synthetic_size_worker following any
            // return value -- we don't need to care about shutdown because no tenant is found when
            // pageserver is shut down.
-            calculate_and_log(&tenant, &cancel, &ctx).await;
+            calculate_and_log(&tenant, cancel, ctx).await;
        }

        crate::tenant::tasks::warn_when_period_overrun(
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -171,14 +171,14 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
            register,
        };

+        fail::fail_point!("control-plane-client-re-attach");
+
        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
        tracing::info!(
            "Received re-attach response with {} tenants",
            response.tenants.len()
        );

-        failpoint_support::sleep_millis_async!("control-plane-client-re-attach");
-
        Ok(response
            .tenants
            .into_iter()
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -59,14 +59,13 @@ use utils::{completion, id::TimelineId};
 use crate::{
    config::PageServerConf,
    metrics::disk_usage_based_eviction::METRICS,
-    task_mgr::{self, BACKGROUND_RUNTIME},
+    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        mgr::TenantManager,
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
    },
-    CancellableTask, DiskUsageEvictionTask,
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -84,9 +83,17 @@ pub struct DiskUsageEvictionTaskConfig {

 /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
 /// partitioning.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(tag = "type", content = "args")]
 pub enum EvictionOrder {
+    /// Order the layers to be evicted by how recently they have been accessed in absolute
+    /// time.
+    ///
+    /// This strategy is unfair towards slower-growing tenants when some tenants grow
+    /// faster than others.
+    #[default]
+    AbsoluteAccessed,
+
    /// Order the layers to be evicted by how recently they have been accessed relatively within
    /// the set of resident layers of a tenant.
    RelativeAccessed {
@@ -101,14 +108,6 @@ pub enum EvictionOrder {
    },
 }

-impl Default for EvictionOrder {
-    fn default() -> Self {
-        Self::RelativeAccessed {
-            highest_layer_count_loses_first: true,
-        }
-    }
-}
-
 fn default_highest_layer_count_loses_first() -> bool {
    true
 }
@@ -118,6 +117,11 @@ impl EvictionOrder {
        use EvictionOrder::*;

        match self {
+            AbsoluteAccessed => {
+                candidates.sort_unstable_by_key(|(partition, candidate)| {
+                    (*partition, candidate.last_activity_ts)
+                });
+            }
            RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
                (*partition, candidate.relative_last_activity)
            }),
@@ -130,6 +134,7 @@ impl EvictionOrder {
        use EvictionOrder::*;

        match self {
+            AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
            RelativeAccessed {
                highest_layer_count_loses_first,
            } => {
@@ -187,34 +192,36 @@ pub fn launch_disk_usage_global_eviction_task(
    state: Arc<State>,
    tenant_manager: Arc<TenantManager>,
    background_jobs_barrier: completion::Barrier,
-) -> Option<DiskUsageEvictionTask> {
+) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
        info!("disk usage based eviction task not configured");
-        return None;
+        return Ok(());
    };

    info!("launching disk usage based eviction task");

-    let cancel = CancellationToken::new();
-    let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::DiskUsageEviction,
+        None,
+        None,
        "disk usage based eviction",
-        {
-            let cancel = cancel.clone();
-            async move {
-                // wait until initial load is complete, because we cannot evict from loading tenants.
-                tokio::select! {
-                    _ = cancel.cancelled() => { return anyhow::Ok(()); },
-                    _ = background_jobs_barrier.wait() => { }
-                };
+        false,
+        async move {
+            let cancel = task_mgr::shutdown_token();

-                disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel)
-                    .await;
-                anyhow::Ok(())
-            }
+            // wait until initial load is complete, because we cannot evict from loading tenants.
+            tokio::select! {
+                _ = cancel.cancelled() => { return Ok(()); },
+                _ = background_jobs_barrier.wait() => { }
+            };
+
+            disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
+            Ok(())
        },
-    ));
+    );

-    Some(DiskUsageEvictionTask(CancellableTask { cancel, task }))
+    Ok(())
 }

 #[instrument(skip_all)]
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -414,7 +414,7 @@ paths:
        Either archives or unarchives the given timeline.
        An archived timeline may not have any non-archived children.
      requestBody:
-        required: true
+        required: false
        content:
          application/json:
            schema:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1650,9 +1650,7 @@ async fn timeline_compact_handler(
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await
-            // XXX map to correct ApiError for the cases where it's due to shutdown
-            .context("wait completion").map_err(ApiError::InternalServerError)?;
+            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
        }
        json_response(StatusCode::OK, ())
    }
@@ -1678,10 +1676,6 @@ async fn timeline_checkpoint_handler(
    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
        flags |= CompactFlags::ForceImageLayerCreation;
    }
-
-    // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
-    let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
-
    let wait_until_uploaded =
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

@@ -1698,22 +1692,18 @@ async fn timeline_checkpoint_handler(

                }
            })?;
-        if compact {
-            timeline
-                .compact(&cancel, flags, &ctx)
-                .await
-                .map_err(|e|
-                    match e {
-                        CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                        CompactionError::Other(e) => ApiError::InternalServerError(e)
-                    }
-                )?;
-        }
+        timeline
+            .compact(&cancel, flags, &ctx)
+            .await
+            .map_err(|e|
+                match e {
+                    CompactionError::ShuttingDown => ApiError::ShuttingDown,
+                    CompactionError::Other(e) => ApiError::InternalServerError(e)
+                }
+            )?;

        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await
-            // XXX map to correct ApiError for the cases where it's due to shutdown
-            .context("wait completion").map_err(ApiError::InternalServerError)?;
+            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
        }

        json_response(StatusCode::OK, ())
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -13,7 +13,6 @@ pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
 pub use pageserver_api::keyspace;
-use tokio_util::sync::CancellationToken;
 pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
@@ -33,10 +32,7 @@ pub mod walredo;
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
-use tenant::{
-    mgr::{BackgroundPurges, TenantManager},
-    secondary,
-};
+use tenant::mgr::TenantManager;
 use tracing::info;

 /// Current storage format version
@@ -58,39 +54,17 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 pub use crate::metrics::preinitialize_metrics;

-pub struct CancellableTask {
-    pub task: tokio::task::JoinHandle<()>,
-    pub cancel: CancellationToken,
-}
-pub struct HttpEndpointListener(pub CancellableTask);
-pub struct LibpqEndpointListener(pub CancellableTask);
-pub struct ConsumptionMetricsTasks(pub CancellableTask);
-pub struct DiskUsageEvictionTask(pub CancellableTask);
-impl CancellableTask {
-    pub async fn shutdown(self) {
-        self.cancel.cancel();
-        self.task.await.unwrap();
-    }
-}
-
 #[tracing::instrument(skip_all, fields(%exit_code))]
-#[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
-    http_listener: HttpEndpointListener,
-    libpq_listener: LibpqEndpointListener,
-    consumption_metrics_worker: ConsumptionMetricsTasks,
-    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
-    background_purges: BackgroundPurges,
    mut deletion_queue: DeletionQueue,
-    secondary_controller_tasks: secondary::GlobalTasks,
    exit_code: i32,
 ) {
    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
    timed(
-        libpq_listener.0.shutdown(),
+        task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
        "shutdown LibpqEndpointListener",
        Duration::from_secs(1),
    )
@@ -117,44 +91,16 @@ pub async fn shutdown_pageserver(
    // Best effort to persist any outstanding deletions, to avoid leaking objects
    deletion_queue.shutdown(Duration::from_secs(5)).await;

-    timed(
-        consumption_metrics_worker.0.shutdown(),
-        "shutdown consumption metrics",
-        Duration::from_secs(1),
-    )
-    .await;
-
-    timed(
-        futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())),
-        "shutdown disk usage eviction",
-        Duration::from_secs(1),
-    )
-    .await;
-
-    timed(
-        background_purges.shutdown(),
-        "shutdown background purges",
-        Duration::from_secs(1),
-    )
-    .await;
-
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
    timed(
-        http_listener.0.shutdown(),
+        task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
        "shutdown http",
        Duration::from_secs(1),
    )
    .await;

-    timed(
-        secondary_controller_tasks.wait(), // cancellation happened in caller
-        "secondary controller wait",
-        Duration::from_secs(1),
-    )
-    .await;
-
    // There should be nothing left, but let's be sure
    timed(
        task_mgr::shutdown_tasks(None, None, None),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -3104,8 +3104,6 @@ pub fn preinitialize_metrics() {
        &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
        &REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
        &REMOTE_ONDEMAND_DOWNLOADED_BYTES,
-        &CIRCUIT_BREAKERS_BROKEN,
-        &CIRCUIT_BREAKERS_UNBROKEN,
    ]
    .into_iter()
    .for_each(|c| {
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -124,6 +124,7 @@ pub async fn libpq_listener_main(
                    None,
                    None,
                    "serving compute connection task",
+                    false,
                    page_service_conn_main(
                        tenant_manager.clone(),
                        local_auth,
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -284,16 +284,6 @@ impl Timeline {
        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
            return Ok(true);
        }
-        // then check if the database was already initialized.
-        // get_rel_exists can be called before dbdir is created.
-        let buf = version.get(self, DBDIR_KEY, ctx).await?;
-        let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => Ok(dir.dbdirs),
-            Err(e) => Err(PageReconstructError::from(e)),
-        }?;
-        if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
-            return Ok(false);
-        }
        // fetch directory listing
        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
        let buf = version.get(self, key, ctx).await?;
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -408,6 +408,7 @@ pub fn spawn<F>(
    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,
    name: &str,
+    shutdown_process_on_error: bool,
    future: F,
 ) -> PageserverTaskId
 where
@@ -436,6 +437,7 @@ where
        task_id,
        task_cloned,
        cancel,
+        shutdown_process_on_error,
        future,
    ));
    task_mut.join_handle = Some(join_handle);
@@ -452,78 +454,82 @@ async fn task_wrapper<F>(
    task_id: u64,
    task: Arc<PageServerTask>,
    shutdown_token: CancellationToken,
+    shutdown_process_on_error: bool,
    future: F,
 ) where
    F: Future<Output = anyhow::Result<()>> + Send + 'static,
 {
    debug!("Starting task '{}'", task_name);

-    // wrap the future so we log panics and errors
-    let tenant_shard_id = task.tenant_shard_id;
-    let timeline_id = task.timeline_id;
-    let fut = async move {
-        // We use AssertUnwindSafe here so that the payload function
-        // doesn't need to be UnwindSafe. We don't do anything after the
-        // unwinding that would expose us to unwind-unsafe behavior.
-        let result = AssertUnwindSafe(future).catch_unwind().await;
+    let result = SHUTDOWN_TOKEN
+        .scope(
+            shutdown_token,
+            CURRENT_TASK.scope(task, {
+                // We use AssertUnwindSafe here so that the payload function
+                // doesn't need to be UnwindSafe. We don't do anything after the
+                // unwinding that would expose us to unwind-unsafe behavior.
+                AssertUnwindSafe(future).catch_unwind()
+            }),
+        )
+        .await;
+    task_finish(result, task_name, task_id, shutdown_process_on_error).await;
+}
+
+async fn task_finish(
+    result: std::result::Result<
+        anyhow::Result<()>,
+        std::boxed::Box<dyn std::any::Any + std::marker::Send>,
+    >,
+    task_name: String,
+    task_id: u64,
+    shutdown_process_on_error: bool,
+) {
+    // Remove our entry from the global hashmap.
+    let task = TASKS
+        .lock()
+        .unwrap()
+        .remove(&task_id)
+        .expect("no task in registry");
+
+    let mut shutdown_process = false;
+    {
        match result {
            Ok(Ok(())) => {
                debug!("Task '{}' exited normally", task_name);
            }
            Ok(Err(err)) => {
-                error!(
-                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                    task_name, tenant_shard_id, timeline_id, err
-                );
+                if shutdown_process_on_error {
+                    error!(
+                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
+                    );
+                    shutdown_process = true;
+                } else {
+                    error!(
+                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
+                    );
+                }
            }
            Err(err) => {
-                error!(
-                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                    task_name, tenant_shard_id, timeline_id, err
-                );
+                if shutdown_process_on_error {
+                    error!(
+                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
+                    );
+                    shutdown_process = true;
+                } else {
+                    error!(
+                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
+                    );
+                }
            }
        }
-    };
+    }

-    // add the task-locals
-    let fut = CURRENT_TASK.scope(task, fut);
-    let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut);
-
-    // poll future to completion
-    fut.await;
-
-    // Remove our entry from the global hashmap.
-    TASKS
-        .lock()
-        .unwrap()
-        .remove(&task_id)
-        .expect("no task in registry");
-}
-
-pub async fn exit_on_panic_or_error<T, E>(
-    task_name: &'static str,
-    future: impl Future<Output = Result<T, E>>,
-) -> T
-where
-    E: std::fmt::Debug,
-{
-    // We use AssertUnwindSafe here so that the payload function
-    // doesn't need to be UnwindSafe. We don't do anything after the
-    // unwinding that would expose us to unwind-unsafe behavior.
-    let result = AssertUnwindSafe(future).catch_unwind().await;
-    match result {
-        Ok(Ok(val)) => val,
-        Ok(Err(err)) => {
-            error!(
-                task_name,
-                "Task exited with error, exiting process: {err:?}"
-            );
-            std::process::exit(1);
-        }
-        Err(panic_obj) => {
-            error!(task_name, "Task panicked, exiting process: {panic_obj:?}");
-            std::process::exit(1);
-        }
+    if shutdown_process {
+        std::process::exit(1);
    }
 }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -31,7 +31,6 @@ use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
-use std::collections::BTreeMap;
 use std::fmt;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
@@ -96,12 +95,14 @@ use crate::tenant::storage_layer::ImageLayer;
 use crate::walredo;
 use crate::InitializationOrder;
 use std::collections::hash_map::Entry;
+use std::collections::BTreeSet;
 use std::collections::HashMap;
 use std::collections::HashSet;
 use std::fmt::Debug;
 use std::fmt::Display;
 use std::fs;
 use std::fs::File;
+use std::ops::Bound::Included;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
@@ -720,6 +721,7 @@ impl Tenant {
            Some(tenant_shard_id),
            None,
            "attach tenant",
+            false,
            async move {

                info!(
@@ -1620,7 +1622,7 @@ impl Tenant {
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<(), timeline::CompactionError> {
+    ) -> anyhow::Result<(), timeline::CompactionError> {
        // Don't start doing work during shutdown, or when broken, we do not need those in the logs
        if !self.is_active() {
            return Ok(());
@@ -1665,14 +1667,12 @@ impl Tenant {
                .compact(cancel, EnumSet::empty(), ctx)
                .instrument(info_span!("compact_timeline", %timeline_id))
                .await
-                .inspect_err(|e| match e {
-                    timeline::CompactionError::ShuttingDown => (),
-                    timeline::CompactionError::Other(e) => {
-                        self.compaction_circuit_breaker
-                            .lock()
-                            .unwrap()
-                            .fail(&CIRCUIT_BREAKERS_BROKEN, e);
-                    }
+                .map_err(|e| {
+                    self.compaction_circuit_breaker
+                        .lock()
+                        .unwrap()
+                        .fail(&CIRCUIT_BREAKERS_BROKEN, &e);
+                    e
                })?;
        }

@@ -1766,9 +1766,6 @@ impl Tenant {
                .values()
                .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));

-            // Before activation, populate each Timeline's GcInfo with information about its children
-            self.initialize_gc_info(&timelines_accessor);
-
            // Spawn gc and compaction loops. The loops will shut themselves
            // down when they notice that the tenant is inactive.
            tasks::start_background_loops(self, background_jobs_can_start);
@@ -2802,55 +2799,6 @@ impl Tenant {
            .await
    }

-    /// Populate all Timelines' `GcInfo` with information about their children.  We do not set the
-    /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`]
-    ///
-    /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion.
-    fn initialize_gc_info(
-        &self,
-        timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
-    ) {
-        // This function must be called before activation: after activation timeline create/delete operations
-        // might happen, and this function is not safe to run concurrently with those.
-        assert!(!self.is_active());
-
-        // Scan all timelines. For each timeline, remember the timeline ID and
-        // the branch point where it was created.
-        let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> = BTreeMap::new();
-        timelines.iter().for_each(|(timeline_id, timeline_entry)| {
-            if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
-                let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default();
-                ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id));
-            }
-        });
-
-        // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines
-        let horizon = self.get_gc_horizon();
-
-        // Populate each timeline's GcInfo with information about its child branches
-        for timeline in timelines.values() {
-            let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
-                .remove(&timeline.timeline_id)
-                .unwrap_or_default();
-
-            branchpoints.sort_by_key(|b| b.0);
-
-            let mut target = timeline.gc_info.write().unwrap();
-
-            target.retain_lsns = branchpoints;
-
-            let space_cutoff = timeline
-                .get_last_record_lsn()
-                .checked_sub(horizon)
-                .unwrap_or(Lsn(0));
-
-            target.cutoffs = GcCutoffs {
-                space: space_cutoff,
-                time: Lsn::INVALID,
-            };
-        }
-    }
-
    async fn refresh_gc_info_internal(
        &self,
        target_timeline_id: Option<TimelineId>,
@@ -2873,11 +2821,6 @@ impl Tenant {
            .cloned()
            .collect::<Vec<_>>();

-        if target_timeline_id.is_some() && timelines.is_empty() {
-            // We were to act on a particular timeline and it wasn't found
-            return Err(GcError::TimelineNotFound);
-        }
-
        let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
            HashMap::with_capacity(timelines.len());

@@ -2900,63 +2843,68 @@ impl Tenant {
        // because that will stall branch creation.
        let gc_cs = self.gc_cs.lock().await;

-        // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
-        // depend on.  So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
-        // and fail out if it's inaccurate.
-        // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
-        {
-            let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
-                BTreeMap::new();
-            timelines.iter().for_each(|timeline| {
-                if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
-                    let ancestor_children =
-                        all_branchpoints.entry(*ancestor_timeline_id).or_default();
-                    ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
-                }
-            });
-
-            for timeline in &timelines {
-                let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
-                    .remove(&timeline.timeline_id)
-                    .unwrap_or_default();
-
-                branchpoints.sort_by_key(|b| b.0);
-
-                let target = timeline.gc_info.read().unwrap();
-
-                // We require that retain_lsns contains everything in `branchpoints`, but not that
-                // they are exactly equal: timeline deletions can race with us, so retain_lsns
-                // may contain some extra stuff.  It is safe to have extra timelines in there, because it
-                // just means that we retain slightly more data than we otherwise might.
-                let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
-                for b in &branchpoints {
-                    if !have_branchpoints.contains(b) {
-                        tracing::error!(
-                            "Bug: `retain_lsns` is set incorrectly.  Expected be {:?}, but found {:?}",
-                            branchpoints,
-                            target.retain_lsns
-                        );
-                        debug_assert!(false);
-                        // Do not GC based on bad information!
-                        // (ab-use an existing GcError type rather than adding a new one, since this is a
-                        // "should never happen" check that will be removed soon).
-                        return Err(GcError::Remote(anyhow::anyhow!(
-                            "retain_lsns failed validation!"
-                        )));
+        // Scan all timelines. For each timeline, remember the timeline ID and
+        // the branch point where it was created.
+        let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
+            let timelines = self.timelines.lock().unwrap();
+            let mut all_branchpoints = BTreeSet::new();
+            let timelines = {
+                if let Some(target_timeline_id) = target_timeline_id.as_ref() {
+                    if timelines.get(target_timeline_id).is_none() {
+                        return Err(GcError::TimelineNotFound);
                    }
-                }
-            }
-        }
+                };
+
+                timelines
+                    .iter()
+                    .map(|(_timeline_id, timeline_entry)| {
+                        if let Some(ancestor_timeline_id) =
+                            &timeline_entry.get_ancestor_timeline_id()
+                        {
+                            // If target_timeline is specified, we only need to know branchpoints of its children
+                            if let Some(timeline_id) = target_timeline_id {
+                                if ancestor_timeline_id == &timeline_id {
+                                    all_branchpoints.insert((
+                                        *ancestor_timeline_id,
+                                        timeline_entry.get_ancestor_lsn(),
+                                    ));
+                                }
+                            }
+                            // No target timeline specified: collect branchpoints for all timelines
+                            else {
+                                all_branchpoints.insert((
+                                    *ancestor_timeline_id,
+                                    timeline_entry.get_ancestor_lsn(),
+                                ));
+                            }
+                        }
+
+                        timeline_entry.clone()
+                    })
+                    .collect::<Vec<_>>()
+            };
+            (all_branchpoints, timelines)
+        };

        // Ok, we now know all the branch points.
        // Update the GC information for each timeline.
        let mut gc_timelines = Vec::with_capacity(timelines.len());
        for timeline in timelines {
-            // We filtered the timeline list above
+            // If target_timeline is specified, ignore all other timelines
            if let Some(target_timeline_id) = target_timeline_id {
-                assert_eq!(target_timeline_id, timeline.timeline_id);
+                if timeline.timeline_id != target_timeline_id {
+                    continue;
+                }
            }

+            let branchpoints: Vec<Lsn> = all_branchpoints
+                .range((
+                    Included((timeline.timeline_id, Lsn(0))),
+                    Included((timeline.timeline_id, Lsn(u64::MAX))),
+                ))
+                .map(|&x| x.1)
+                .collect();
+
            {
                let mut target = timeline.gc_info.write().unwrap();

@@ -2994,12 +2942,20 @@ impl Tenant {
                        .0,
                );

-                // Apply the cutoffs we found to the Timeline's GcInfo.  Why might we _not_ have cutoffs for a timeline?
-                // - this timeline was created while we were finding cutoffs
-                // - lsn for timestamp search fails for this timeline repeatedly
-                if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
-                    target.cutoffs = cutoffs.clone();
-                }
+                match gc_cutoffs.remove(&timeline.timeline_id) {
+                    Some(cutoffs) => {
+                        target.retain_lsns = branchpoints;
+                        target.cutoffs = cutoffs;
+                    }
+                    None => {
+                        // Reasons why no cutoffs were computed for this timeline:
+                        // - this timeline was created while we were finding cutoffs
+                        // - lsn for timestamp search fails for this timeline repeatedly
+                        //
+                        // In both cases, refreshing the branchpoints is still correct.
+                        target.retain_lsns = branchpoints;
+                    }
+                };
            }

            gc_timelines.push(timeline);
@@ -4037,7 +3993,6 @@ mod tests {
    use storage_layer::PersistentLayerKey;
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
-    use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
    use timeline::{DeltaLayerTestDesc, GcInfo};
    use utils::bin_ser::BeSer;
    use utils::id::TenantId;
@@ -4388,7 +4343,7 @@ mod tests {
        {
            let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
            assert_eq!(branchpoints.len(), 1);
-            assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID));
+            assert_eq!(branchpoints[0], Lsn(0x40));
        }

        // You can read the key from the child branch even though the parent is
@@ -4570,7 +4525,7 @@ mod tests {
        let layer_map = tline.layers.read().await;
        let level0_deltas = layer_map
            .layer_map()
-            .get_level0_deltas()
+            .get_level0_deltas()?
            .into_iter()
            .map(|desc| layer_map.get_from_desc(&desc))
            .collect::<Vec<_>>();
@@ -5789,7 +5744,7 @@ mod tests {
            .read()
            .await
            .layer_map()
-            .get_level0_deltas()
+            .get_level0_deltas()?
            .len();

        tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
@@ -5799,7 +5754,7 @@ mod tests {
            .read()
            .await
            .layer_map()
-            .get_level0_deltas()
+            .get_level0_deltas()?
            .len();

        assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
@@ -7260,323 +7215,4 @@ mod tests {

        Ok(())
    }
-
-    #[tokio::test]
-    async fn test_generate_key_retention() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_generate_key_retention").await?;
-        let (tenant, ctx) = harness.load().await;
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await?;
-        tline.force_advance_lsn(Lsn(0x70));
-        let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
-        let history = vec![
-            (
-                key,
-                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"0x10")),
-            ),
-            (
-                key,
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-            ),
-            (
-                key,
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
-            ),
-            (
-                key,
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
-            ),
-            (
-                key,
-                Lsn(0x50),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x50")),
-            ),
-            (
-                key,
-                Lsn(0x60),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
-            ),
-            (
-                key,
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            ),
-            (
-                key,
-                Lsn(0x80),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x80")),
-            ),
-            (
-                key,
-                Lsn(0x90),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
-            ),
-        ];
-        let res = tline
-            .generate_key_retention(
-                key,
-                &history,
-                Lsn(0x60),
-                &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
-                3,
-            )
-            .await
-            .unwrap();
-        let expected_res = KeyHistoryRetention {
-            below_horizon: vec![
-                (
-                    Lsn(0x20),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x20),
-                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20")),
-                    )]),
-                ),
-                (
-                    Lsn(0x40),
-                    KeyLogAtLsn(vec![
-                        (
-                            Lsn(0x30),
-                            Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
-                        ),
-                        (
-                            Lsn(0x40),
-                            Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
-                        ),
-                    ]),
-                ),
-                (
-                    Lsn(0x50),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x50),
-                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40;0x50")),
-                    )]),
-                ),
-                (
-                    Lsn(0x60),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x60),
-                        Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
-                    )]),
-                ),
-            ],
-            above_horizon: KeyLogAtLsn(vec![
-                (
-                    Lsn(0x70),
-                    Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-                ),
-                (
-                    Lsn(0x80),
-                    Value::WalRecord(NeonWalRecord::wal_append(";0x80")),
-                ),
-                (
-                    Lsn(0x90),
-                    Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
-                ),
-            ]),
-        };
-        assert_eq!(res, expected_res);
-        // TODO: more tests with mixed image + delta, adding with k-merge test cases; e2e compaction test
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> {
-        let harness =
-            TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns").await?;
-        let (tenant, ctx) = harness.load().await;
-
-        fn get_key(id: u32) -> Key {
-            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
-            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-
-        let img_layer = (0..10)
-            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
-            .collect_vec();
-
-        let delta1 = vec![
-            (
-                get_key(1),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(2),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x28),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
-            ),
-        ];
-        let delta2 = vec![
-            (
-                get_key(5),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(6),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-        ];
-        let delta3 = vec![
-            (
-                get_key(8),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-            (
-                get_key(9),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-        ];
-
-        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
-                ], // delta layers
-                vec![(Lsn(0x10), img_layer)], // image layers
-                Lsn(0x50),
-            )
-            .await?;
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            *guard = GcInfo {
-                retain_lsns: vec![
-                    (Lsn(0x10), tline.timeline_id),
-                    (Lsn(0x20), tline.timeline_id),
-                ],
-                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
-                    space: Lsn(0x30),
-                },
-                leases: Default::default(),
-                within_ancestor_pitr: false,
-            };
-        }
-
-        let expected_result = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10@0x30"),
-            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10@0x48"),
-            Bytes::from_static(b"value 9@0x10@0x48"),
-        ];
-
-        let expected_result_at_gc_horizon = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10@0x30"),
-            Bytes::from_static(b"value 3@0x10@0x28@0x30"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10"),
-            Bytes::from_static(b"value 9@0x10"),
-        ];
-
-        let expected_result_at_lsn_20 = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10"),
-            Bytes::from_static(b"value 3@0x10"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10"),
-            Bytes::from_static(b"value 9@0x10"),
-        ];
-
-        let expected_result_at_lsn_10 = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10"),
-            Bytes::from_static(b"value 2@0x10"),
-            Bytes::from_static(b"value 3@0x10"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10"),
-            Bytes::from_static(b"value 6@0x10"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10"),
-            Bytes::from_static(b"value 9@0x10"),
-        ];
-
-        let verify_result = || async {
-            for idx in 0..10 {
-                assert_eq!(
-                    tline
-                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result[idx]
-                );
-                assert_eq!(
-                    tline
-                        .get(get_key(idx as u32), Lsn(0x30), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_gc_horizon[idx]
-                );
-                assert_eq!(
-                    tline
-                        .get(get_key(idx as u32), Lsn(0x20), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_lsn_20[idx]
-                );
-                assert_eq!(
-                    tline
-                        .get(get_key(idx as u32), Lsn(0x10), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_lsn_10[idx]
-                );
-            }
-        };
-
-        verify_result().await;
-
-        let cancel = CancellationToken::new();
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
-
-        verify_result().await;
-
-        Ok(())
-    }
 }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -463,7 +463,7 @@ impl LayerMap {
    pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094

-        if Self::is_l0(&layer_desc.key_range) {
+        if Self::is_l0(&layer_desc) {
            self.l0_delta_layers.push(layer_desc.clone().into());
        }

@@ -482,7 +482,7 @@ impl LayerMap {
        self.historic
            .remove(historic_layer_coverage::LayerKey::from(layer_desc));
        let layer_key = layer_desc.key();
-        if Self::is_l0(&layer_desc.key_range) {
+        if Self::is_l0(layer_desc) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
            l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -598,9 +598,8 @@ impl LayerMap {
        coverage
    }

-    /// Check if the key range resembles that of an L0 layer.
-    pub fn is_l0(key_range: &Range<Key>) -> bool {
-        key_range == &(Key::MIN..Key::MAX)
+    pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
+        layer.get_key_range() == (Key::MIN..Key::MAX)
    }

    /// This function determines which layers are counted in `count_deltas`:
@@ -627,7 +626,7 @@ impl LayerMap {
    ///      than just the current partition_range.
    pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
        // Case 1
-        if !Self::is_l0(&layer.key_range) {
+        if !Self::is_l0(layer) {
            return true;
        }

@@ -845,8 +844,8 @@ impl LayerMap {
    }

    /// Return all L0 delta layers
-    pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
-        self.l0_delta_layers.to_vec()
+    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
+        Ok(self.l0_delta_layers.to_vec())
    }

    /// debugging function to print out the contents of the layer map
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -36,7 +36,7 @@ use crate::control_plane_client::{
 use crate::deletion_queue::DeletionQueueClient;
 use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
 use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
-use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
 };
@@ -225,98 +225,26 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
    Ok(tmp_path)
 }

-/// See [`Self::spawn`].
-#[derive(Clone)]
-pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
-enum BackgroundPurgesInner {
-    Open(tokio::task::JoinSet<()>),
-    // we use the async mutex for coalescing
-    ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
-}
+/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
+/// the background, and thereby avoid blocking any API requests on this deletion completing.
+fn spawn_background_purge(tmp_path: Utf8PathBuf) {
+    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
+    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
+    let task_tenant_id = None;

-impl Default for BackgroundPurges {
-    fn default() -> Self {
-        Self(Arc::new(std::sync::Mutex::new(
-            BackgroundPurgesInner::Open(JoinSet::new()),
-        )))
-    }
-}
-
-impl BackgroundPurges {
-    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
-    /// the background, and thereby avoid blocking any API requests on this deletion completing.
-    ///
-    /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
-    /// Thus the [`BackgroundPurges`] type to keep track of these tasks.
-    pub fn spawn(&self, tmp_path: Utf8PathBuf) {
-        let mut guard = self.0.lock().unwrap();
-        let jset = match &mut *guard {
-            BackgroundPurgesInner::Open(ref mut jset) => jset,
-            BackgroundPurgesInner::ShuttingDown(_) => {
-                warn!("trying to spawn background purge during shutdown, ignoring");
-                return;
-            }
-        };
-        jset.spawn_on(
-            async move {
-                if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
-                    // should we fatal_io_error here?
-                    warn!(%error, path=%tmp_path, "failed to purge tenant directory");
-                }
-            }
-            .instrument(info_span!(parent: None, "background_purge")),
-            BACKGROUND_RUNTIME.handle(),
-        );
-    }
-
-    /// When this future completes, all background purges have completed.
-    /// The first poll of the future will already lock out new background purges spawned via [`Self::spawn`].
-    ///
-    /// Concurrent calls will coalesce.
-    ///
-    /// # Cancellation-Safety
-    ///
-    /// If this future is dropped before polled to completion, concurrent and subsequent
-    /// instances of this future will continue to be correct.
-    #[instrument(skip_all)]
-    pub async fn shutdown(&self) {
-        let jset = {
-            let mut guard = self.0.lock().unwrap();
-            match &mut *guard {
-                BackgroundPurgesInner::Open(jset) => {
-                    *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
-                        std::mem::take(jset),
-                    )))
-                }
-                BackgroundPurgesInner::ShuttingDown(_) => {
-                    // calling shutdown multiple times is most likely a bug in pageserver shutdown code
-                    warn!("already shutting down");
-                }
-            };
-            match &mut *guard {
-                BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
-                BackgroundPurgesInner::Open(_) => {
-                    unreachable!("above code transitions into shut down state");
-                }
-            }
-        };
-        let mut jset = jset.lock().await; // concurrent callers coalesce here
-        while let Some(res) = jset.join_next().await {
-            match res {
-                Ok(()) => {}
-                Err(e) if e.is_panic() => {
-                    // If it panicked, the error is already logged by the panic hook.
-                }
-                Err(e) if e.is_cancelled() => {
-                    unreachable!("we don't cancel the joinset or runtime")
-                }
-                Err(e) => {
-                    // No idea when this can happen, but let's log it.
-                    warn!(%e, "background purge task failed or panicked");
-                }
-            }
-        }
-    }
+    task_mgr::spawn(
+        task_mgr::BACKGROUND_RUNTIME.handle(),
+        TaskKind::MgmtRequest,
+        task_tenant_id,
+        None,
+        "tenant_files_delete",
+        false,
+        async move {
+            fs::remove_dir_all(tmp_path.as_path())
+                .await
+                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+        },
+    );
 }

 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
@@ -342,8 +270,6 @@ pub struct TenantManager {
    // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
    // when the tenant detaches.
    cancel: CancellationToken,
-
-    background_purges: BackgroundPurges,
 }

 fn emergency_generations(
@@ -521,7 +447,6 @@ pub(crate) enum DeleteTenantError {
 #[instrument(skip_all)]
 pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
-    background_purges: BackgroundPurges,
    resources: TenantSharedResources,
    init_order: InitializationOrder,
    cancel: CancellationToken,
@@ -587,7 +512,7 @@ pub async fn init_tenant_mgr(

                    match safe_rename_tenant_dir(&tenant_dir_path).await {
                        Ok(tmp_path) => {
-                            background_purges.spawn(tmp_path);
+                            spawn_background_purge(tmp_path);
                        }
                        Err(e) => {
                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
@@ -709,7 +634,6 @@ pub async fn init_tenant_mgr(
        tenants: &TENANTS,
        resources,
        cancel: CancellationToken::new(),
-        background_purges,
    })
 }

@@ -1429,7 +1353,6 @@ impl TenantManager {

        async fn delete_local(
            conf: &PageServerConf,
-            background_purges: &BackgroundPurges,
            tenant_shard_id: &TenantShardId,
        ) -> anyhow::Result<()> {
            let local_tenant_directory = conf.tenant_path(tenant_shard_id);
@@ -1438,7 +1361,7 @@ impl TenantManager {
                .with_context(|| {
                    format!("local tenant directory {local_tenant_directory:?} rename")
                })?;
-            background_purges.spawn(tmp_dir);
+            spawn_background_purge(tmp_dir);
            Ok(())
        }

@@ -1456,12 +1379,12 @@ impl TenantManager {
                        barrier.wait().await;
                    }
                }
-                delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?;
+                delete_local(self.conf, &tenant_shard_id).await?;
            }
            Some(TenantSlot::Secondary(secondary_tenant)) => {
                secondary_tenant.shutdown().await;

-                delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?;
+                delete_local(self.conf, &tenant_shard_id).await?;
            }
            Some(TenantSlot::InProgress(_)) => unreachable!(),
            None => {}
@@ -1732,7 +1655,7 @@ impl TenantManager {
        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
-        self.background_purges.spawn(tmp_path);
+        spawn_background_purge(tmp_path);

        fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
            "failpoint"
@@ -1908,7 +1831,7 @@ impl TenantManager {
        let tmp_path = self
            .detach_tenant0(conf, tenant_shard_id, deletion_queue_client)
            .await?;
-        self.background_purges.spawn(tmp_path);
+        spawn_background_purge(tmp_path);

        Ok(())
    }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -287,14 +287,6 @@ pub enum PersistIndexPartWithDeletedFlagError {
    Other(#[from] anyhow::Error),
 }

-#[derive(Debug, thiserror::Error)]
-pub enum WaitCompletionError {
-    #[error(transparent)]
-    NotInitialized(NotInitialized),
-    #[error("wait_completion aborted because upload queue was stopped")]
-    UploadQueueShutDownOrStopped,
-}
-
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -638,7 +630,7 @@ impl RemoteTimelineClient {
    ///
    /// Like schedule_index_upload_for_metadata_update(), this merely adds
    /// the upload to the upload queue and returns quickly.
-    pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> Result<(), NotInitialized> {
+    pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

@@ -653,7 +645,7 @@ impl RemoteTimelineClient {
    fn schedule_index_upload(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-    ) -> Result<(), NotInitialized> {
+    ) -> anyhow::Result<()> {
        let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
        // fix up the duplicated field
        upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn;
@@ -661,7 +653,7 @@ impl RemoteTimelineClient {
        // make sure it serializes before doing it in perform_upload_task so that it doesn't
        // look like a retryable error
        let void = std::io::sink();
-        serde_json::to_writer(void, &upload_queue.dirty).expect("serialize index_part.json");
+        serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?;

        let index_part = &upload_queue.dirty;

@@ -707,9 +699,7 @@ impl RemoteTimelineClient {
            self.schedule_barrier0(upload_queue)
        };

-        Self::wait_completion0(receiver)
-            .await
-            .context("wait completion")
+        Self::wait_completion0(receiver).await
    }

    /// Schedules uploading a new version of `index_part.json` with the given layers added,
@@ -742,9 +732,7 @@ impl RemoteTimelineClient {
            barrier
        };

-        Self::wait_completion0(barrier)
-            .await
-            .context("wait completion")
+        Self::wait_completion0(barrier).await
    }

    /// Launch an upload operation in the background; the file is added to be included in next
@@ -752,7 +740,7 @@ impl RemoteTimelineClient {
    pub(crate) fn schedule_layer_file_upload(
        self: &Arc<Self>,
        layer: ResidentLayer,
-    ) -> Result<(), NotInitialized> {
+    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

@@ -838,7 +826,7 @@ impl RemoteTimelineClient {
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
        names: I,
-    ) -> Result<Vec<(LayerName, LayerFileMetadata)>, NotInitialized>
+    ) -> anyhow::Result<Vec<(LayerName, LayerFileMetadata)>>
    where
        I: IntoIterator<Item = LayerName>,
    {
@@ -964,7 +952,7 @@ impl RemoteTimelineClient {
        self: &Arc<Self>,
        compacted_from: &[Layer],
        compacted_to: &[ResidentLayer],
-    ) -> Result<(), NotInitialized> {
+    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

@@ -981,12 +969,10 @@ impl RemoteTimelineClient {
    }

    /// Wait for all previously scheduled uploads/deletions to complete
-    pub(crate) async fn wait_completion(self: &Arc<Self>) -> Result<(), WaitCompletionError> {
+    pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
        let receiver = {
            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard
-                .initialized_mut()
-                .map_err(WaitCompletionError::NotInitialized)?;
+            let upload_queue = guard.initialized_mut()?;
            self.schedule_barrier0(upload_queue)
        };

@@ -995,9 +981,9 @@ impl RemoteTimelineClient {

    async fn wait_completion0(
        mut receiver: tokio::sync::watch::Receiver<()>,
-    ) -> Result<(), WaitCompletionError> {
+    ) -> anyhow::Result<()> {
        if receiver.changed().await.is_err() {
-            return Err(WaitCompletionError::UploadQueueShutDownOrStopped);
+            anyhow::bail!("wait_completion aborted because upload queue was stopped");
        }

        Ok(())
@@ -1539,6 +1525,7 @@ impl RemoteTimelineClient {
                Some(self.tenant_shard_id),
                Some(self.timeline_id),
                "remote upload",
+                false,
                async move {
                    self_rc.perform_upload_task(task).await;
                    Ok(())
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -31,7 +31,6 @@ use pageserver_api::{
 };
 use remote_storage::GenericRemoteStorage;

-use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::instrument;
 use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
@@ -294,50 +293,15 @@ impl SecondaryController {
    }
 }

-pub struct GlobalTasks {
-    cancel: CancellationToken,
-    uploader: JoinHandle<()>,
-    downloader: JoinHandle<()>,
-}
-
-impl GlobalTasks {
-    /// Caller is responsible for requesting shutdown via the cancellation token that was
-    /// passed to [`spawn_tasks`].
-    ///
-    /// # Panics
-    ///
-    /// This method panics if that token is not cancelled.
-    /// This is low-risk because we're calling this during process shutdown, so, a panic
-    /// will be informative but not cause undue downtime.
-    pub async fn wait(self) {
-        let Self {
-            cancel,
-            uploader,
-            downloader,
-        } = self;
-        assert!(
-            cancel.is_cancelled(),
-            "must cancel cancellation token, otherwise the tasks will not shut down"
-        );
-
-        let (uploader, downloader) = futures::future::join(uploader, downloader).await;
-        uploader.expect(
-            "unreachable: exit_on_panic_or_error would catch the panic and exit the process",
-        );
-        downloader.expect(
-            "unreachable: exit_on_panic_or_error would catch the panic and exit the process",
-        );
-    }
-}
-
 pub fn spawn_tasks(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
-) -> (SecondaryController, GlobalTasks) {
+) -> SecondaryController {
    let mgr_clone = tenant_manager.clone();
    let storage_clone = remote_storage.clone();
+    let cancel_clone = cancel.clone();
    let bg_jobs_clone = background_jobs_can_start.clone();

    let (download_req_tx, download_req_rx) =
@@ -345,9 +309,17 @@ pub fn spawn_tasks(
    let (upload_req_tx, upload_req_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);

-    let cancel_clone = cancel.clone();
-    let downloader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+    let downloader_task_ctx = RequestContext::new(
+        TaskKind::SecondaryDownloads,
+        crate::context::DownloadBehavior::Download,
+    );
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        downloader_task_ctx.task_kind(),
+        None,
+        None,
        "secondary tenant downloads",
+        false,
        async move {
            downloader_task(
                mgr_clone,
@@ -355,41 +327,49 @@ pub fn spawn_tasks(
                download_req_rx,
                bg_jobs_clone,
                cancel_clone,
-                RequestContext::new(
-                    TaskKind::SecondaryDownloads,
-                    crate::context::DownloadBehavior::Download,
-                ),
+                downloader_task_ctx,
            )
            .await;
-            anyhow::Ok(())
-        },
-    ));

-    let cancel_clone = cancel.clone();
-    let uploader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+            Ok(())
+        },
+    );
+
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::SecondaryUploads,
+        None,
+        None,
        "heatmap uploads",
+        false,
        async move {
            heatmap_uploader_task(
                tenant_manager,
                remote_storage,
                upload_req_rx,
                background_jobs_can_start,
-                cancel_clone,
+                cancel,
            )
            .await;
-            anyhow::Ok(())
-        },
-    ));

-    (
-        SecondaryController {
-            upload_req_tx,
-            download_req_tx,
+            Ok(())
        },
-        GlobalTasks {
-            cancel,
-            uploader,
-            downloader,
-        },
-    )
+    );
+
+    SecondaryController {
+        download_req_tx,
+        upload_req_tx,
+    }
+}
+
+/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
+pub fn null_controller() -> SecondaryController {
+    let (download_req_tx, _download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
+    let (upload_req_tx, _upload_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
+    SecondaryController {
+        upload_req_tx,
+        download_req_tx,
+    }
 }
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -264,10 +264,10 @@ pub(super) async fn gather_inputs(
        let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
            .retain_lsns
            .iter()
-            .filter(|(lsn, _child_id)| lsn > &ancestor_lsn)
+            .filter(|&&lsn| lsn > ancestor_lsn)
            .copied()
            // this assumes there are no other retain_lsns than the branchpoints
-            .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint))
+            .map(|lsn| (lsn, LsnKind::BranchPoint))
            .collect::<Vec<_>>();

        lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -10,18 +10,29 @@ pub mod merge_iterator;

 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
+use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use bytes::Bytes;
+use enum_map::EnumMap;
+use enumset::EnumSet;
+use once_cell::sync::Lazy;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
+use pageserver_api::models::{
+    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
+};
+use std::borrow::Cow;
 use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
-use std::sync::Arc;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
+use tracing::warn;
+use utils::history_buffer::HistoryBufferWithDropCounter;
+use utils::rate_limit::RateLimit;

-use utils::lsn::Lsn;
+use utils::{id::TimelineId, lsn::Lsn};

 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
@@ -64,9 +75,9 @@ where
 /// call, to collect more records.
 ///
 #[derive(Debug, Default)]
-pub(crate) struct ValueReconstructState {
-    pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
-    pub(crate) img: Option<(Lsn, Bytes)>,
+pub struct ValueReconstructState {
+    pub records: Vec<(Lsn, NeonWalRecord)>,
+    pub img: Option<(Lsn, Bytes)>,
 }

 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
@@ -447,92 +458,94 @@ pub enum ValueReconstructResult {
    Missing,
 }

-/// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
-/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
-/// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
-/// be used for cache management but not for correctness-critical checks.
-#[derive(Default, Debug, Clone, PartialEq, Eq)]
-pub(crate) enum LayerVisibilityHint {
-    /// A Visible layer might be read while serving a read, because there is not an image layer between it
-    /// and a readable LSN (the tip of the branch or a child's branch point)
-    Visible,
-    /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
-    /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
-    #[allow(unused)]
-    Covered,
-    /// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
-    /// in this state.  Note that newly written layers may be called Visible immediately, this uninitialized
-    /// state is for when existing layers are constructed while loading a timeline.
-    #[default]
-    Uninitialized,
+#[derive(Debug)]
+pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
+
+/// This struct holds two instances of [`LayerAccessStatsInner`].
+/// Accesses are recorded to both instances.
+/// The `for_scraping_api` instance can be reset from the management API via [`LayerAccessStatsReset`].
+/// The `for_eviction_policy` instance is never reset.
+#[derive(Debug, Default, Clone)]
+struct LayerAccessStatsLocked {
+    for_scraping_api: LayerAccessStatsInner,
+    for_eviction_policy: LayerAccessStatsInner,
 }

-pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
+impl LayerAccessStatsLocked {
+    fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
+        [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
+    }
+}
+
+#[derive(Debug, Default, Clone)]
+struct LayerAccessStatsInner {
+    first_access: Option<LayerAccessStatFullDetails>,
+    count_by_access_kind: EnumMap<LayerAccessKind, u64>,
+    task_kind_flag: EnumSet<TaskKind>,
+    last_accesses: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
+    last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct LayerAccessStatFullDetails {
+    pub(crate) when: SystemTime,
+    pub(crate) task_kind: TaskKind,
+    pub(crate) access_kind: LayerAccessKind,
+}

 #[derive(Clone, Copy, strum_macros::EnumString)]
-pub(crate) enum LayerAccessStatsReset {
+pub enum LayerAccessStatsReset {
    NoReset,
+    JustTaskKindFlags,
    AllStats,
 }

-impl Default for LayerAccessStats {
-    fn default() -> Self {
-        // Default value is to assume resident since creation time, and visible.
-        let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now());
-        value |= 0x1 << Self::VISIBILITY_SHIFT;
+fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
+    ts.duration_since(UNIX_EPOCH)
+        .expect("better to die in this unlikely case than report false stats")
+        .as_millis()
+        .try_into()
+        .expect("64 bits is enough for few more years")
+}

-        Self(std::sync::atomic::AtomicU64::new(value))
+impl LayerAccessStatFullDetails {
+    fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
+        let Self {
+            when,
+            task_kind,
+            access_kind,
+        } = self;
+        pageserver_api::models::LayerAccessStatFullDetails {
+            when_millis_since_epoch: system_time_to_millis_since_epoch(when),
+            task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros
+            access_kind: *access_kind,
+        }
    }
 }

-// Efficient store of two very-low-resolution timestamps and some bits.  Used for storing last access time and
-// last residence change time.
 impl LayerAccessStats {
-    // How many high bits to drop from a u32 timestamp?
-    // - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use
-    //   after that, this software has been very successful!)
-    // - Dropping the top bit is implicitly safe because unix timestamps are meant to be
-    // stored in an i32, so they never used it.
-    // - Dropping the next two bits is safe because this code is only running on systems in
-    // years >= 2024, and these bits have been 1 since 2021
-    //
-    // Therefore we may store only 28 bits for a timestamp with one second resolution.  We do
-    // this truncation to make space for some flags in the high bits of our u64.
-    const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1;
-    const TS_MASK: u32 = 0x1f_ff_ff_ff;
-    const TS_ONES: u32 = 0x60_00_00_00;
-
-    const ATIME_SHIFT: u32 = 0;
-    const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS;
-    const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS;
-
-    fn write_bits(&self, mask: u64, value: u64) -> u64 {
-        self.0
-            .fetch_update(
-                // TODO: decide what orderings are correct
-                std::sync::atomic::Ordering::Relaxed,
-                std::sync::atomic::Ordering::Relaxed,
-                |v| Some((v & !mask) | (value & mask)),
-            )
-            .expect("Inner function is infallible")
+    /// Create an empty stats object.
+    ///
+    /// The caller is responsible for recording a residence event
+    /// using [`record_residence_event`] before calling `latest_activity`.
+    /// If they don't, [`latest_activity`] will return `None`.
+    ///
+    /// [`record_residence_event`]: Self::record_residence_event
+    /// [`latest_activity`]: Self::latest_activity
+    pub(crate) fn empty_will_record_residence_event_later() -> Self {
+        LayerAccessStats(Mutex::default())
    }

-    fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) {
-        // Drop the low three bits of the timestamp, for an ~8s accuracy
-        let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64);
-
-        ((Self::TS_MASK as u64) << shift, timestamp << shift)
-    }
-
-    fn read_low_res_timestamp(&self, shift: u32) -> Option<SystemTime> {
-        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
-
-        let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift;
-        if ts_bits == 0 {
-            None
-        } else {
-            Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64)))
-        }
+    /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
+    ///
+    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
+    ///
+    /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
+    /// [`record_residence_event`]: Self::record_residence_event
+    pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
+        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
+        new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
+        new
    }

    /// Record a change in layer residency.
@@ -548,64 +561,117 @@ impl LayerAccessStats {
    /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
    /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
    /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
-    pub(crate) fn record_residence_event_at(&self, now: SystemTime) {
-        let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now);
-        self.write_bits(mask, value);
+    ///
+    pub(crate) fn record_residence_event(
+        &self,
+        status: LayerResidenceStatus,
+        reason: LayerResidenceEventReason,
+    ) {
+        let mut locked = self.0.lock().unwrap();
+        locked.iter_mut().for_each(|inner| {
+            inner
+                .last_residence_changes
+                .write(LayerResidenceEvent::new(status, reason))
+        });
    }

-    pub(crate) fn record_residence_event(&self) {
-        self.record_residence_event_at(SystemTime::now())
-    }
-
-    pub(crate) fn record_access_at(&self, now: SystemTime) {
-        let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
-
-        // A layer which is accessed must be visible.
-        mask |= 0x1 << Self::VISIBILITY_SHIFT;
-        value |= 0x1 << Self::VISIBILITY_SHIFT;
-
-        self.write_bits(mask, value);
-    }
-
-    pub(crate) fn record_access(&self, ctx: &RequestContext) {
+    fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
        if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
            return;
        }

-        self.record_access_at(SystemTime::now())
+        let this_access = LayerAccessStatFullDetails {
+            when: SystemTime::now(),
+            task_kind: ctx.task_kind(),
+            access_kind,
+        };
+
+        let mut locked = self.0.lock().unwrap();
+        locked.iter_mut().for_each(|inner| {
+            inner.first_access.get_or_insert(this_access);
+            inner.count_by_access_kind[access_kind] += 1;
+            inner.task_kind_flag |= ctx.task_kind();
+            inner.last_accesses.write(this_access);
+        })
    }

    fn as_api_model(
        &self,
        reset: LayerAccessStatsReset,
    ) -> pageserver_api::models::LayerAccessStats {
+        let mut locked = self.0.lock().unwrap();
+        let inner = &mut locked.for_scraping_api;
+        let LayerAccessStatsInner {
+            first_access,
+            count_by_access_kind,
+            task_kind_flag,
+            last_accesses,
+            last_residence_changes,
+        } = inner;
        let ret = pageserver_api::models::LayerAccessStats {
-            access_time: self
-                .read_low_res_timestamp(Self::ATIME_SHIFT)
-                .unwrap_or(UNIX_EPOCH),
-            residence_time: self
-                .read_low_res_timestamp(Self::RTIME_SHIFT)
-                .unwrap_or(UNIX_EPOCH),
-            visible: matches!(self.visibility(), LayerVisibilityHint::Visible),
+            access_count_by_access_kind: count_by_access_kind
+                .iter()
+                .map(|(kind, count)| (kind, *count))
+                .collect(),
+            task_kind_access_flag: task_kind_flag
+                .iter()
+                .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros
+                .collect(),
+            first: first_access.as_ref().map(|a| a.as_api_model()),
+            accesses_history: last_accesses.map(|m| m.as_api_model()),
+            residence_events_history: last_residence_changes.clone(),
        };
        match reset {
-            LayerAccessStatsReset::NoReset => {}
+            LayerAccessStatsReset::NoReset => (),
+            LayerAccessStatsReset::JustTaskKindFlags => {
+                inner.task_kind_flag.clear();
+            }
            LayerAccessStatsReset::AllStats => {
-                self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0);
-                self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0);
+                *inner = LayerAccessStatsInner::default();
            }
        }
        ret
    }

-    /// Get the latest access timestamp, falling back to latest residence event.  The latest residence event
-    /// will be this Layer's construction time, if its residence hasn't changed since then.
-    pub(crate) fn latest_activity(&self) -> SystemTime {
-        if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) {
-            t
-        } else {
-            self.read_low_res_timestamp(Self::RTIME_SHIFT)
-                .expect("Residence time is set on construction")
+    /// Get the latest access timestamp, falling back to latest residence event, further falling
+    /// back to `SystemTime::now` for a usable timestamp for eviction.
+    pub(crate) fn latest_activity_or_now(&self) -> SystemTime {
+        self.latest_activity().unwrap_or_else(SystemTime::now)
+    }
+
+    /// Get the latest access timestamp, falling back to latest residence event.
+    ///
+    /// This function can only return `None` if there has not yet been a call to the
+    /// [`record_residence_event`] method. That would generally be considered an
+    /// implementation error. This function logs a rate-limited warning in that case.
+    ///
+    /// TODO: use the type system to avoid the need for a fallback (see `latest_activity_or_now`).
+    /// The approach in <https://github.com/neondatabase/neon/pull/3775>
+    /// could be used to enforce that a residence event is recorded
+    /// before a layer is added to the layer map. We could also have
+    /// a layer wrapper type that holds the LayerAccessStats, and ensure
+    /// that that type can only be produced by inserting into the layer map.
+    ///
+    /// [`record_residence_event`]: Self::record_residence_event
+    fn latest_activity(&self) -> Option<SystemTime> {
+        let locked = self.0.lock().unwrap();
+        let inner = &locked.for_eviction_policy;
+        match inner.last_accesses.recent() {
+            Some(a) => Some(a.when),
+            None => match inner.last_residence_changes.recent() {
+                Some(e) => Some(e.timestamp),
+                None => {
+                    static WARN_RATE_LIMIT: Lazy<Mutex<(usize, RateLimit)>> =
+                        Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10)))));
+                    let mut guard = WARN_RATE_LIMIT.lock().unwrap();
+                    guard.0 += 1;
+                    let occurences = guard.0;
+                    guard.1.call(move || {
+                        warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value");
+                    });
+                    None
+                }
+            },
        }
    }

@@ -614,46 +680,30 @@ impl LayerAccessStats {
    /// This indicates whether the layer has been used for some purpose that would motivate
    /// us to keep it on disk, such as for serving a getpage request.
    fn accessed(&self) -> bool {
+        let locked = self.0.lock().unwrap();
+        let inner = &locked.for_eviction_policy;
+
        // Consider it accessed if the most recent access is more recent than
        // the most recent change in residence status.
        match (
-            self.read_low_res_timestamp(Self::ATIME_SHIFT),
-            self.read_low_res_timestamp(Self::RTIME_SHIFT),
+            inner.last_accesses.recent(),
+            inner.last_residence_changes.recent(),
        ) {
            (None, _) => false,
            (Some(_), None) => true,
-            (Some(a), Some(r)) => a >= r,
-        }
-    }
-
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
-        let value = match visibility {
-            LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
-            LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
-        };
-
-        self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
-    }
-
-    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
-        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
-        match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
-            1 => LayerVisibilityHint::Visible,
-            0 => LayerVisibilityHint::Covered,
-            _ => unreachable!(),
+            (Some(a), Some(r)) => a.when >= r.timestamp,
        }
    }
 }

 /// Get a layer descriptor from a layer.
-pub(crate) trait AsLayerDesc {
+pub trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
 }

 pub mod tests {
    use pageserver_api::shard::TenantShardId;
-    use utils::id::TimelineId;

    use super::*;

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -52,7 +52,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
-use pageserver_api::models::ImageCompressionAlgorithm;
+use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -265,7 +265,7 @@ impl DeltaLayer {
            return Ok(());
        }

-        let inner = self.load(ctx).await?;
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;

        inner.dump(ctx).await
    }
@@ -298,8 +298,12 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
-        self.access_stats.record_access(ctx);
+    async fn load(
+        &self,
+        access_kind: LayerAccessKind,
+        ctx: &RequestContext,
+    ) -> Result<&Arc<DeltaLayerInner>> {
+        self.access_stats.record_access(access_kind, ctx);
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
@@ -352,7 +356,7 @@ impl DeltaLayer {
                summary.lsn_range,
                metadata.len(),
            ),
-            access_stats: Default::default(),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: OnceCell::new(),
        })
    }
@@ -456,12 +460,7 @@ impl DeltaLayerWriterInner {
        will_init: bool,
        ctx: &RequestContext,
    ) -> (Vec<u8>, anyhow::Result<()>) {
-        assert!(
-            self.lsn_range.start <= lsn,
-            "lsn_start={}, lsn={}",
-            self.lsn_range.start,
-            lsn
-        );
+        assert!(self.lsn_range.start <= lsn);
        // We don't want to use compression in delta layer creation
        let compression = ImageCompressionAlgorithm::Disabled;
        let (val, res) = self
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -49,6 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
+use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -227,7 +228,7 @@ impl ImageLayer {
            return Ok(());
        }

-        let inner = self.load(ctx).await?;
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;

        inner.dump(ctx).await?;

@@ -254,8 +255,12 @@ impl ImageLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
-        self.access_stats.record_access(ctx);
+    async fn load(
+        &self,
+        access_kind: LayerAccessKind,
+        ctx: &RequestContext,
+    ) -> Result<&ImageLayerInner> {
+        self.access_stats.record_access(access_kind, ctx);
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
            .await
@@ -307,7 +312,7 @@ impl ImageLayer {
                metadata.len(),
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
-            access_stats: Default::default(),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: OnceCell::new(),
        })
    }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -18,7 +18,7 @@ use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, BinaryHeap, HashSet};
 use std::sync::{Arc, OnceLock};
 use std::time::Instant;
 use tracing::*;
@@ -375,6 +375,15 @@ impl InMemoryLayer {
        let inner = self.inner.read().await;
        let reader = inner.file.block_cursor();

+        #[derive(Eq, PartialEq, Ord, PartialOrd)]
+        struct BlockRead {
+            key: Key,
+            lsn: Lsn,
+            block_offset: u64,
+        }
+
+        let mut planned_block_reads = BinaryHeap::new();
+
        for range in keyspace.ranges.iter() {
            for (key, vec_map) in inner.index.range(range.start..range.end) {
                let lsn_range = match reconstruct_state.get_cached_lsn(key) {
@@ -383,32 +392,49 @@ impl InMemoryLayer {
                };

                let slice = vec_map.slice_range(lsn_range);
-
                for (entry_lsn, pos) in slice.iter().rev() {
-                    // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
-                    let buf = reader.read_blob(*pos, &ctx).await;
-                    if let Err(e) = buf {
-                        reconstruct_state
-                            .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
-                        break;
-                    }
-
-                    let value = Value::des(&buf.unwrap());
-                    if let Err(e) = value {
-                        reconstruct_state
-                            .on_key_error(*key, PageReconstructError::from(anyhow!(e)));
-                        break;
-                    }
-
-                    let key_situation =
-                        reconstruct_state.update_key(key, *entry_lsn, value.unwrap());
-                    if key_situation == ValueReconstructSituation::Complete {
-                        break;
-                    }
+                    planned_block_reads.push(BlockRead {
+                        key: *key,
+                        lsn: *entry_lsn,
+                        block_offset: *pos,
+                    });
                }
            }
        }

+        let keyspace_size = keyspace.total_raw_size();
+
+        let mut completed_keys = HashSet::new();
+        while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
+            let block_read = planned_block_reads.pop().unwrap();
+            if completed_keys.contains(&block_read.key) {
+                continue;
+            }
+
+            // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
+            let buf = reader.read_blob(block_read.block_offset, &ctx).await;
+            if let Err(e) = buf {
+                reconstruct_state
+                    .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
+                completed_keys.insert(block_read.key);
+                continue;
+            }
+
+            let value = Value::des(&buf.unwrap());
+            if let Err(e) = value {
+                reconstruct_state
+                    .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
+                completed_keys.insert(block_read.key);
+                continue;
+            }
+
+            let key_situation =
+                reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap());
+            if key_situation == ValueReconstructSituation::Complete {
+                completed_keys.insert(block_read.key);
+            }
+        }
+
        reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);

        Ok(())
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1,7 +1,9 @@
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::keyspace::KeySpace;
-use pageserver_api::models::HistoricLayerInfo;
+use pageserver_api::models::{
+    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
+};
 use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
@@ -17,7 +19,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::repository::Key;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::task_mgr::TaskKind;
-use crate::tenant::timeline::{CompactionError, GetVectoredError};
+use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
@@ -158,10 +160,13 @@ impl Layer {
            metadata.file_size,
        );

+        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
+
        let owner = Layer(Arc::new(LayerInner::new(
            conf,
            timeline,
            local_path,
+            access_stats,
            desc,
            None,
            metadata.generation,
@@ -188,6 +193,8 @@ impl Layer {
            metadata.file_size,
        );

+        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
+
        let mut resident = None;

        let owner = Layer(Arc::new_cyclic(|owner| {
@@ -202,6 +209,7 @@ impl Layer {
                conf,
                timeline,
                local_path,
+                access_stats,
                desc,
                Some(inner),
                metadata.generation,
@@ -237,6 +245,11 @@ impl Layer {
                version: 0,
            });
            resident = Some(inner.clone());
+            let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
+            access_stats.record_residence_event(
+                LayerResidenceStatus::Resident,
+                LayerResidenceEventReason::LayerCreate,
+            );

            let local_path = local_layer_path(
                conf,
@@ -246,22 +259,16 @@ impl Layer {
                &timeline.generation,
            );

-            let layer = LayerInner::new(
+            LayerInner::new(
                conf,
                timeline,
                local_path,
+                access_stats,
                desc,
                Some(inner),
                timeline.generation,
                timeline.get_shard_index(),
-            );
-
-            // Newly created layers are marked visible by default: the usual case is that they were created to be read.
-            layer
-                .access_stats
-                .set_visibility(super::LayerVisibilityHint::Visible);
-
-            layer
+            )
        }));

        let downloaded = resident.expect("just initialized");
@@ -325,7 +332,9 @@ impl Layer {
        use anyhow::ensure;

        let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
-        self.0.access_stats.record_access(ctx);
+        self.0
+            .access_stats
+            .record_access(LayerAccessKind::GetValueReconstructData, ctx);

        if self.layer_desc().is_delta {
            ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
@@ -359,7 +368,9 @@ impl Layer {
                other => GetVectoredError::Other(anyhow::anyhow!(other)),
            })?;

-        self.0.access_stats.record_access(ctx);
+        self.0
+            .access_stats
+            .record_access(LayerAccessKind::GetValueReconstructData, ctx);

        layer
            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -426,7 +437,7 @@ impl Layer {
    }

    /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
-    pub(crate) async fn download_and_keep_resident(&self) -> Result<ResidentLayer, DownloadError> {
+    pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result<ResidentLayer> {
        let downloaded = self.0.get_or_maybe_download(true, None).await?;

        Ok(ResidentLayer {
@@ -775,6 +786,7 @@ impl LayerInner {
        conf: &'static PageServerConf,
        timeline: &Arc<Timeline>,
        local_path: Utf8PathBuf,
+        access_stats: LayerAccessStats,
        desc: PersistentLayerDesc,
        downloaded: Option<Arc<DownloadedLayer>>,
        generation: Generation,
@@ -809,7 +821,7 @@ impl LayerInner {
            path: local_path,
            desc,
            timeline: Arc::downgrade(timeline),
-            access_stats: Default::default(),
+            access_stats,
            wanted_deleted: AtomicBool::new(false),
            inner,
            version: AtomicUsize::new(version),
@@ -1164,7 +1176,10 @@ impl LayerInner {
                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
                }

-                self.access_stats.record_residence_event();
+                self.access_stats.record_residence_event(
+                    LayerResidenceStatus::Resident,
+                    LayerResidenceEventReason::ResidenceChange,
+                );

                Ok(self.initialize_after_layer_is_on_disk(permit))
            }
@@ -1283,7 +1298,7 @@ impl LayerInner {
                lsn_end: lsn_range.end,
                remote: !resident,
                access_stats,
-                l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
+                l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
            }
        } else {
            let lsn = self.desc.image_layer_lsn();
@@ -1518,7 +1533,10 @@ impl LayerInner {
            }
        }

-        self.access_stats.record_residence_event();
+        self.access_stats.record_residence_event(
+            LayerResidenceStatus::Evicted,
+            LayerResidenceEventReason::ResidenceChange,
+        );

        self.status.as_ref().unwrap().send_replace(Status::Evicted);

@@ -1844,7 +1862,9 @@ impl ResidentLayer {
                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
                // while it's being held.
-                owner.access_stats.record_access(ctx);
+                owner
+                    .access_stats
+                    .record_access(LayerAccessKind::KeyIter, ctx);

                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
@@ -1862,24 +1882,12 @@ impl ResidentLayer {
        shard_identity: &ShardIdentity,
        writer: &mut ImageLayerWriter,
        ctx: &RequestContext,
-    ) -> Result<usize, CompactionError> {
+    ) -> anyhow::Result<usize> {
        use LayerKind::*;

-        match self
-            .downloaded
-            .get(&self.owner.0, ctx)
-            .await
-            .map_err(CompactionError::Other)?
-        {
-            Delta(_) => {
-                return Err(CompactionError::Other(anyhow::anyhow!(format!(
-                    "cannot filter() on a delta layer {self}"
-                ))));
-            }
-            Image(i) => i
-                .filter(shard_identity, writer, ctx)
-                .await
-                .map_err(CompactionError::Other),
+        match self.downloaded.get(&self.owner.0, ctx).await? {
+            Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
+            Image(i) => i.filter(shard_identity, writer, ctx).await,
        }
    }

--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1,5 +1,3 @@
-use std::time::UNIX_EPOCH;
-
 use pageserver_api::key::CONTROLFILE_KEY;
 use tokio::task::JoinSet;
 use utils::{
@@ -9,7 +7,7 @@ use utils::{

 use super::failpoints::{Failpoint, FailpointKind};
 use super::*;
-use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint};
+use crate::context::DownloadBehavior;
 use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};

 /// Used in tests to advance a future to wanted await point, and not futher.
@@ -828,9 +826,9 @@ async fn eviction_cancellation_on_drop() {
 #[test]
 #[cfg(target_arch = "x86_64")]
 fn layer_size() {
-    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 8);
+    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(std::mem::size_of::<LayerInner>(), 312);
+    assert_eq!(std::mem::size_of::<LayerInner>(), 2344);
    // it also has the utf8 path
 }

@@ -970,46 +968,3 @@ fn spawn_blocking_pool_helper_actually_works() {
        println!("joined");
    });
 }
-
-/// Drop the low bits from a time, to emulate the precision loss in LayerAccessStats
-fn lowres_time(hires: SystemTime) -> SystemTime {
-    let ts = hires.duration_since(UNIX_EPOCH).unwrap().as_secs();
-    UNIX_EPOCH + Duration::from_secs(ts)
-}
-
-#[test]
-fn access_stats() {
-    let access_stats = LayerAccessStats::default();
-    // Default is visible
-    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible);
-
-    access_stats.set_visibility(LayerVisibilityHint::Covered);
-    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
-    access_stats.set_visibility(LayerVisibilityHint::Visible);
-    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible);
-
-    let rtime = UNIX_EPOCH + Duration::from_secs(2000000000);
-    access_stats.record_residence_event_at(rtime);
-    assert_eq!(access_stats.latest_activity(), lowres_time(rtime));
-
-    let atime = UNIX_EPOCH + Duration::from_secs(2100000000);
-    access_stats.record_access_at(atime);
-    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
-
-    // Setting visibility doesn't clobber access time
-    access_stats.set_visibility(LayerVisibilityHint::Covered);
-    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
-    access_stats.set_visibility(LayerVisibilityHint::Visible);
-    assert_eq!(access_stats.latest_activity(), lowres_time(atime));
-}
-
-#[test]
-fn access_stats_2038() {
-    // The access stats structure uses a timestamp representation that will run out
-    // of bits in 2038.  One year before that, this unit test will start failing.
-
-    let one_year_from_now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap()
-        + Duration::from_secs(3600 * 24 * 365);
-
-    assert!(one_year_from_now.as_secs() < (2 << 31));
-}
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -248,14 +248,6 @@ impl LayerName {
            Image(_) => "image",
        }
    }
-
-    /// Gets the key range encoded in the layer name.
-    pub fn key_range(&self) -> &Range<Key> {
-        match &self {
-            LayerName::Image(layer) => &layer.key_range,
-            LayerName::Delta(layer) => &layer.key_range,
-        }
-    }
 }

 impl fmt::Display for LayerName {
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -101,6 +101,7 @@ pub fn start_background_loops(
        Some(tenant_shard_id),
        None,
        &format!("compactor for tenant {tenant_shard_id}"),
+        false,
        {
            let tenant = Arc::clone(tenant);
            let background_jobs_can_start = background_jobs_can_start.cloned();
@@ -124,6 +125,7 @@ pub fn start_background_loops(
        Some(tenant_shard_id),
        None,
        &format!("garbage collector for tenant {tenant_shard_id}"),
+        false,
        {
            let tenant = Arc::clone(tenant);
            let background_jobs_can_start = background_jobs_can_start.cloned();
@@ -147,6 +149,7 @@ pub fn start_background_loops(
        Some(tenant_shard_id),
        None,
        &format!("ingest housekeeping for tenant {tenant_shard_id}"),
+        false,
        {
            let tenant = Arc::clone(tenant);
            let background_jobs_can_start = background_jobs_can_start.cloned();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,5 +1,5 @@
 pub(crate) mod analysis;
-pub(crate) mod compaction;
+mod compaction;
 pub mod delete;
 pub(crate) mod detach_ancestor;
 mod eviction_task;
@@ -460,7 +460,7 @@ pub(crate) struct GcInfo {
    /// Currently, this includes all points where child branches have
    /// been forked off from. In the future, could also include
    /// explicit user-defined snapshot points.
-    pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>,
+    pub(crate) retain_lsns: Vec<Lsn>,

    /// The cutoff coordinates, which are combined by selecting the minimum.
    pub(crate) cutoffs: GcCutoffs,
@@ -476,21 +476,12 @@ impl GcInfo {
    pub(crate) fn min_cutoff(&self) -> Lsn {
        self.cutoffs.select_min()
    }
-
-    pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) {
-        self.retain_lsns.push((child_lsn, child_id));
-        self.retain_lsns.sort_by_key(|i| i.0);
-    }
-
-    pub(super) fn remove_child(&mut self, child_id: TimelineId) {
-        self.retain_lsns.retain(|i| i.1 != child_id);
-    }
 }

 /// The `GcInfo` component describing which Lsns need to be retained.  Functionally, this
 /// is a single number (the oldest LSN which we must retain), but it internally distinguishes
 /// between time-based and space-based retention for observability and consumption metrics purposes.
-#[derive(Debug, Clone)]
+#[derive(Debug)]
 pub(crate) struct GcCutoffs {
    /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
    /// history we must keep to retain a specified number of bytes of WAL.
@@ -2316,11 +2307,6 @@ impl Timeline {
            )
        };

-        if let Some(ancestor) = &ancestor {
-            let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
-            ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn());
-        }
-
        Arc::new_cyclic(|myself| {
            let metrics = TimelineMetrics::new(
                &tenant_shard_id,
@@ -2491,6 +2477,7 @@ impl Timeline {
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "layer flush task",
+            false,
            async move {
                let _guard = guard;
                let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
@@ -2835,6 +2822,7 @@ impl Timeline {
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "initial size calculation",
+            false,
            // NB: don't log errors here, task_mgr will do that.
            async move {
                let cancel = task_mgr::shutdown_token();
@@ -3003,6 +2991,7 @@ impl Timeline {
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "ondemand logical size calculation",
+            false,
            async move {
                let res = self_clone
                    .logical_size_calculation_task(lsn, cause, &ctx)
@@ -3169,7 +3158,7 @@ impl Timeline {
        let guard = self.layers.read().await;

        let resident = guard.likely_resident_layers().map(|layer| {
-            let last_activity_ts = layer.access_stats().latest_activity();
+            let last_activity_ts = layer.access_stats().latest_activity_or_now();

            HeatMapLayer::new(
                layer.layer_desc().layer_name(),
@@ -4767,18 +4756,6 @@ impl Timeline {
    }
 }

-impl Drop for Timeline {
-    fn drop(&mut self) {
-        if let Some(ancestor) = &self.ancestor_timeline {
-            // This lock should never be poisoned, but in case it is we do a .map() instead of
-            // an unwrap(), to avoid panicking in a destructor and thereby aborting the process.
-            if let Ok(mut gc_info) = ancestor.gc_info.write() {
-                gc_info.remove_child(self.timeline_id)
-            }
-        }
-    }
-}
-
 /// Top-level failure to compact.
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum CompactionError {
@@ -4786,7 +4763,7 @@ pub(crate) enum CompactionError {
    ShuttingDown,
    /// Compaction cannot be done right now; page reconstruction and so on.
    #[error(transparent)]
-    Other(anyhow::Error),
+    Other(#[from] anyhow::Error),
 }

 impl From<CollectKeySpaceError> for CompactionError {
@@ -4801,38 +4778,6 @@ impl From<CollectKeySpaceError> for CompactionError {
    }
 }

-impl From<super::upload_queue::NotInitialized> for CompactionError {
-    fn from(value: super::upload_queue::NotInitialized) -> Self {
-        match value {
-            super::upload_queue::NotInitialized::Uninitialized
-            | super::upload_queue::NotInitialized::Stopped => {
-                CompactionError::Other(anyhow::anyhow!(value))
-            }
-            super::upload_queue::NotInitialized::ShuttingDown => CompactionError::ShuttingDown,
-        }
-    }
-}
-
-impl CompactionError {
-    /// We cannot do compaction because we could not download a layer that is input to the compaction.
-    pub(crate) fn input_layer_download_failed(
-        e: super::storage_layer::layer::DownloadError,
-    ) -> Self {
-        match e {
-            super::storage_layer::layer::DownloadError::TimelineShutdown |
-            /* TODO DownloadCancelled correct here? */
-            super::storage_layer::layer::DownloadError::DownloadCancelled  => CompactionError::ShuttingDown,
-            super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads |
-            super::storage_layer::layer::DownloadError::DownloadRequired |
-            super::storage_layer::layer::DownloadError::NotFile(_) |
-            super::storage_layer::layer::DownloadError::DownloadFailed |
-            super::storage_layer::layer::DownloadError::PreStatFailed(_)=>CompactionError::Other(anyhow::anyhow!(e)),
-            #[cfg(test)]
-            super::storage_layer::layer::DownloadError::Failpoint(_) =>  CompactionError::Other(anyhow::anyhow!(e)),
-        }
-    }
-}
-
 #[serde_as]
 #[derive(serde::Serialize)]
 struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration);
@@ -4906,7 +4851,7 @@ impl Timeline {
        new_deltas: &[ResidentLayer],
        new_images: &[ResidentLayer],
        layers_to_remove: &[Layer],
-    ) -> Result<(), CompactionError> {
+    ) -> anyhow::Result<()> {
        let mut guard = self.layers.write().await;

        let mut duplicated_layers = HashSet::new();
@@ -4923,8 +4868,8 @@ impl Timeline {
                // for compact_level0_phase1 creating an L0, which does not happen in practice
                // because we have not implemented L0 => L0 compaction.
                duplicated_layers.insert(l.layer_desc().key());
-            } else if LayerMap::is_l0(&l.layer_desc().key_range) {
-                return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
+            } else if LayerMap::is_l0(l.layer_desc()) {
+                bail!("compaction generates a L0 layer file as output, which will cause infinite compaction.");
            } else {
                insert_layers.push(l.clone());
            }
@@ -4956,7 +4901,7 @@ impl Timeline {
        self: &Arc<Self>,
        mut replace_layers: Vec<(Layer, ResidentLayer)>,
        mut drop_layers: Vec<Layer>,
-    ) -> Result<(), super::upload_queue::NotInitialized> {
+    ) -> anyhow::Result<()> {
        let mut guard = self.layers.write().await;

        // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
@@ -4978,7 +4923,7 @@ impl Timeline {
    fn upload_new_image_layers(
        self: &Arc<Self>,
        new_images: impl IntoIterator<Item = ResidentLayer>,
-    ) -> Result<(), super::upload_queue::NotInitialized> {
+    ) -> anyhow::Result<()> {
        for layer in new_images {
            self.remote_client.schedule_layer_file_upload(layer)?;
        }
@@ -5128,11 +5073,7 @@ impl Timeline {

            let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
            let time_cutoff = gc_info.cutoffs.time;
-            let retain_lsns = gc_info
-                .retain_lsns
-                .iter()
-                .map(|(lsn, _child_id)| *lsn)
-                .collect();
+            let retain_lsns = gc_info.retain_lsns.clone();

            // Gets the maximum LSN that holds the valid lease.
            //
@@ -5494,6 +5435,7 @@ impl Timeline {
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "download all remote layers task",
+            false,
            async move {
                self_clone.download_all_remote_layers(request).await;
                let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
@@ -5644,7 +5586,7 @@ impl Timeline {
                let file_size = layer.layer_desc().file_size;
                max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));

-                let last_activity_ts = layer.access_stats().latest_activity();
+                let last_activity_ts = layer.access_stats().latest_activity_or_now();

                EvictionCandidate {
                    layer: layer.into(),
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -27,9 +27,8 @@ use utils::id::TimelineId;
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
 use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
-use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
-use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState};
+use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
 use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{Layer, ResidentLayer};
@@ -37,7 +36,7 @@ use crate::tenant::DeltaLayer;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};

 use crate::keyspace::KeySpace;
-use crate::repository::{Key, Value};
+use crate::repository::Key;

 use utils::lsn::Lsn;

@@ -46,60 +45,6 @@ use pageserver_compaction::interface::*;

 use super::CompactionError;

-/// Maximum number of deltas before generating an image layer in bottom-most compaction.
-const COMPACTION_DELTA_THRESHOLD: usize = 5;
-
-/// The result of bottom-most compaction for a single key at each LSN.
-#[derive(Debug)]
-#[cfg_attr(test, derive(PartialEq))]
-pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>);
-
-/// The result of bottom-most compaction.
-#[derive(Debug)]
-#[cfg_attr(test, derive(PartialEq))]
-pub(crate) struct KeyHistoryRetention {
-    /// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN.
-    pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>,
-    /// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN.
-    pub(crate) above_horizon: KeyLogAtLsn,
-}
-
-impl KeyHistoryRetention {
-    async fn pipe_to(
-        self,
-        key: Key,
-        delta_writer: &mut Vec<(Key, Lsn, Value)>,
-        image_writer: &mut ImageLayerWriter,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let mut first_batch = true;
-        for (_, KeyLogAtLsn(logs)) in self.below_horizon {
-            if first_batch {
-                if logs.len() == 1 && logs[0].1.is_image() {
-                    let Value::Image(img) = &logs[0].1 else {
-                        unreachable!()
-                    };
-                    image_writer.put_image(key, img.clone(), ctx).await?;
-                } else {
-                    for (lsn, val) in logs {
-                        delta_writer.push((key, lsn, val));
-                    }
-                }
-                first_batch = false;
-            } else {
-                for (lsn, val) in logs {
-                    delta_writer.push((key, lsn, val));
-                }
-            }
-        }
-        let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
-        for (lsn, val) in above_horizon_logs {
-            delta_writer.push((key, lsn, val));
-        }
-        Ok(())
-    }
-}
-
 impl Timeline {
    /// TODO: cancellation
    pub(crate) async fn compact_legacy(
@@ -109,10 +54,7 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<(), CompactionError> {
        if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
-            return self
-                .compact_with_gc(cancel, ctx)
-                .await
-                .map_err(CompactionError::Other);
+            return self.compact_with_gc(cancel, ctx).await;
        }

        // High level strategy for compaction / image creation:
@@ -240,7 +182,7 @@ impl Timeline {
        self: &Arc<Self>,
        rewrite_max: usize,
        ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
+    ) -> anyhow::Result<()> {
        let mut drop_layers = Vec::new();
        let mut layers_to_rewrite: Vec<Layer> = Vec::new();

@@ -361,8 +303,7 @@ impl Timeline {
                layer.layer_desc().image_layer_lsn(),
                ctx,
            )
-            .await
-            .map_err(CompactionError::Other)?;
+            .await?;

            // Safety of layer rewrites:
            // - We are writing to a different local file path than we are reading from, so the old Layer
@@ -377,20 +318,14 @@ impl Timeline {
            // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
            //    - GC, which at worst witnesses us "undelete" a layer that they just deleted.
            //    - ingestion, which only inserts layers, therefore cannot collide with us.
-            let resident = layer
-                .download_and_keep_resident()
-                .await
-                .map_err(CompactionError::input_layer_download_failed)?;
+            let resident = layer.download_and_keep_resident().await?;

            let keys_written = resident
                .filter(&self.shard_identity, &mut image_layer_writer, ctx)
                .await?;

            if keys_written > 0 {
-                let new_layer = image_layer_writer
-                    .finish(self, ctx)
-                    .await
-                    .map_err(CompactionError::Other)?;
+                let new_layer = image_layer_writer.finish(self, ctx).await?;
                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
                    layer.metadata().file_size,
                    new_layer.metadata().file_size);
@@ -418,13 +353,7 @@ impl Timeline {
        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
        // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
        // load.
-        match self.remote_client.wait_completion().await {
-            Ok(()) => (),
-            Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
-            Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
-                return Err(CompactionError::ShuttingDown)
-            }
-        }
+        self.remote_client.wait_completion().await?;

        fail::fail_point!("compact-shard-ancestors-persistent");

@@ -482,7 +411,7 @@ impl Timeline {
        stats.read_lock_held_spawn_blocking_startup_micros =
            stats.read_lock_acquisition_micros.till_now(); // set by caller
        let layers = guard.layer_map();
-        let level0_deltas = layers.get_level0_deltas();
+        let level0_deltas = layers.get_level0_deltas()?;
        let mut level0_deltas = level0_deltas
            .into_iter()
            .map(|x| guard.get_from_desc(&x))
@@ -535,23 +464,14 @@ impl Timeline {
        ) as u64
            * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);

-        deltas_to_compact.push(
-            first_level0_delta
-                .download_and_keep_resident()
-                .await
-                .map_err(CompactionError::input_layer_download_failed)?,
-        );
+        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
        for l in level0_deltas_iter {
            let lsn_range = &l.layer_desc().lsn_range;

            if lsn_range.start != prev_lsn_end {
                break;
            }
-            deltas_to_compact.push(
-                l.download_and_keep_resident()
-                    .await
-                    .map_err(CompactionError::input_layer_download_failed)?,
-            );
+            deltas_to_compact.push(l.download_and_keep_resident().await?);
            deltas_to_compact_bytes += l.metadata().file_size;
            prev_lsn_end = lsn_range.end;

@@ -610,7 +530,7 @@ impl Timeline {
        let mut all_keys = Vec::new();

        for l in deltas_to_compact.iter() {
-            all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
+            all_keys.extend(l.load_keys(ctx).await?);
        }

        // FIXME: should spawn_blocking the rest of this function
@@ -732,7 +652,7 @@ impl Timeline {
            key, lsn, ref val, ..
        } in all_values_iter
        {
-            let value = val.load(ctx).await.map_err(CompactionError::Other)?;
+            let value = val.load(ctx).await?;
            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
            // We need to check key boundaries once we reach next key or end of layer with the same key
            if !same_key || lsn == dup_end_lsn {
@@ -789,8 +709,7 @@ impl Timeline {
                                .take()
                                .unwrap()
                                .finish(prev_key.unwrap().next(), self, ctx)
-                                .await
-                                .map_err(CompactionError::Other)?,
+                                .await?,
                        );
                        writer = None;

@@ -828,8 +747,7 @@ impl Timeline {
                            },
                            ctx,
                        )
-                        .await
-                        .map_err(CompactionError::Other)?,
+                        .await?,
                    );
                }

@@ -837,8 +755,7 @@ impl Timeline {
                    .as_mut()
                    .unwrap()
                    .put_value(key, lsn, value, ctx)
-                    .await
-                    .map_err(CompactionError::Other)?;
+                    .await?;
            } else {
                debug!(
                    "Dropping key {} during compaction (it belongs on shard {:?})",
@@ -854,12 +771,7 @@ impl Timeline {
            prev_key = Some(key);
        }
        if let Some(writer) = writer {
-            new_layers.push(
-                writer
-                    .finish(prev_key.unwrap().next(), self, ctx)
-                    .await
-                    .map_err(CompactionError::Other)?,
-            );
+            new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?);
        }

        // Sync layers
@@ -1041,7 +953,7 @@ impl Timeline {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();

-            let l0_deltas = layers.get_level0_deltas();
+            let l0_deltas = layers.get_level0_deltas()?;
            drop(guard);

            // As an optimization, if we find that there are too few L0 layers,
@@ -1071,196 +983,12 @@ impl Timeline {
            fanout,
            ctx,
        )
-        .await
-        // TODO: compact_tiered needs to return CompactionError
-        .map_err(CompactionError::Other)?;
+        .await?;

        adaptor.flush_updates().await?;
        Ok(())
    }

-    /// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns.
-    ///
-    /// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon.
-    /// For now, it requires the `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is
-    /// an image or a WAL not requiring a base image). This restriction will be removed once we implement gc-compaction on branch.
-    ///
-    /// The function returns the deltas and the base image that need to be placed at each of the retain LSN. For example, we have:
-    ///
-    /// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60
-    /// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3
-    ///
-    /// The function will produce:
-    ///
-    /// ```plain
-    /// 0x20(retain_lsn) -> img=AB@0x20                  always produce a single image below the lowest retain LSN
-    /// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40]    two deltas since the last base image, keeping the deltas
-    /// 0x50(horizon)    -> deltas=[ABCDE@0x50]          three deltas since the last base image, generate an image but put it in the delta
-    /// above_horizon    -> deltas=[+F@0x60]             full history above the horizon
-    /// ```
-    ///
-    /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
-    pub(crate) async fn generate_key_retention(
-        self: &Arc<Timeline>,
-        key: Key,
-        history: &[(Key, Lsn, Value)],
-        horizon: Lsn,
-        retain_lsn_below_horizon: &[Lsn],
-        delta_threshold_cnt: usize,
-    ) -> anyhow::Result<KeyHistoryRetention> {
-        // Pre-checks for the invariants
-        if cfg!(debug_assertions) {
-            for (log_key, _, _) in history {
-                assert_eq!(log_key, &key, "mismatched key");
-            }
-            for i in 1..history.len() {
-                assert!(history[i - 1].1 <= history[i].1, "unordered LSN");
-                if history[i - 1].1 == history[i].1 {
-                    assert!(
-                        matches!(history[i - 1].2, Value::Image(_)),
-                        "unordered delta/image, or duplicated delta"
-                    );
-                }
-            }
-            if let Value::WalRecord(rec) = &history[0].2 {
-                assert!(rec.will_init(), "no base image");
-            }
-            for lsn in retain_lsn_below_horizon {
-                assert!(lsn < &horizon, "retain lsn must be below horizon")
-            }
-            for i in 1..retain_lsn_below_horizon.len() {
-                assert!(
-                    retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i],
-                    "unordered LSN"
-                );
-            }
-        }
-        // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
-        // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
-        let (mut split_history, lsn_split_points) = {
-            let mut split_history = Vec::new();
-            split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new);
-            let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1);
-            for lsn in retain_lsn_below_horizon {
-                lsn_split_points.push(*lsn);
-            }
-            lsn_split_points.push(horizon);
-            let mut current_idx = 0;
-            for item @ (_, lsn, _) in history {
-                while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] {
-                    current_idx += 1;
-                }
-                split_history[current_idx].push(item);
-            }
-            (split_history, lsn_split_points)
-        };
-        // Step 2: filter out duplicated records due to the k-merge of image/delta layers
-        for split_for_lsn in &mut split_history {
-            let mut prev_lsn = None;
-            let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len());
-            for record @ (_, lsn, _) in std::mem::take(split_for_lsn) {
-                if let Some(prev_lsn) = &prev_lsn {
-                    if *prev_lsn == lsn {
-                        // The case that we have an LSN with both data from the delta layer and the image layer. As
-                        // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
-                        // drop this delta and keep the image.
-                        //
-                        // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
-                        // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
-                        // dropped.
-                        continue;
-                    }
-                }
-                prev_lsn = Some(lsn);
-                new_split_for_lsn.push(record);
-            }
-            *split_for_lsn = new_split_for_lsn;
-        }
-        // Step 3: generate images when necessary
-        let mut retention = Vec::with_capacity(split_history.len());
-        let mut records_since_last_image = 0;
-        let batch_cnt = split_history.len();
-        assert!(
-            batch_cnt >= 2,
-            "should have at least below + above horizon batches"
-        );
-        let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
-        for (i, split_for_lsn) in split_history.into_iter().enumerate() {
-            records_since_last_image += split_for_lsn.len();
-            let generate_image = if i == 0 {
-                // We always generate images for the first batch (below horizon / lowest retain_lsn)
-                true
-            } else if i == batch_cnt - 1 {
-                // Do not generate images for the last batch (above horizon)
-                false
-            } else if records_since_last_image >= delta_threshold_cnt {
-                // Generate images when there are too many records
-                true
-            } else {
-                false
-            };
-            replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone()));
-            if let Some((_, _, val)) = replay_history.first() {
-                assert!(val.will_init(), "invalid history, no base image");
-            }
-            // Only retain the items after the last image record
-            for idx in (0..replay_history.len()).rev() {
-                if replay_history[idx].2.will_init() {
-                    replay_history = replay_history[idx..].to_vec();
-                    break;
-                }
-            }
-            if generate_image && records_since_last_image > 0 {
-                records_since_last_image = 0;
-                let history = std::mem::take(&mut replay_history);
-                let mut img = None;
-                let mut records = Vec::with_capacity(history.len());
-                if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
-                    img = Some((*lsn, val.clone()));
-                    for (_, lsn, val) in history.into_iter().skip(1) {
-                        let Value::WalRecord(rec) = val else {
-                            panic!("invalid record")
-                        };
-                        records.push((lsn, rec));
-                    }
-                } else {
-                    for (_, lsn, val) in history.into_iter() {
-                        let Value::WalRecord(rec) = val else {
-                            panic!("invalid record")
-                        };
-                        records.push((lsn, rec));
-                    }
-                }
-                records.reverse();
-                let state = ValueReconstructState { img, records };
-                let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range
-                let img = self.reconstruct_value(key, request_lsn, state).await?;
-                replay_history.push((key, request_lsn, Value::Image(img.clone())));
-                retention.push(vec![(request_lsn, Value::Image(img))]);
-            } else {
-                retention.push(
-                    split_for_lsn
-                        .iter()
-                        .map(|(_, lsn, value)| (*lsn, value.clone()))
-                        .collect(),
-                );
-            }
-        }
-        let mut result = Vec::with_capacity(retention.len());
-        assert_eq!(retention.len(), lsn_split_points.len() + 1);
-        for (idx, logs) in retention.into_iter().enumerate() {
-            if idx == lsn_split_points.len() {
-                return Ok(KeyHistoryRetention {
-                    below_horizon: result,
-                    above_horizon: KeyLogAtLsn(logs),
-                });
-            } else {
-                result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
-            }
-        }
-        unreachable!()
-    }
-
    /// An experimental compaction building block that combines compaction with garbage collection.
    ///
    /// The current implementation picks all delta + image layers that are below or intersecting with
@@ -1271,7 +999,8 @@ impl Timeline {
        self: &Arc<Self>,
        _cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), CompactionError> {
+        use crate::tenant::storage_layer::ValueReconstructState;
        use std::collections::BTreeSet;

        info!("running enhanced gc bottom-most compaction");
@@ -1284,51 +1013,30 @@ impl Timeline {
        // The layer selection has the following properties:
        // 1. If a layer is in the selection, all layers below it are in the selection.
        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
-        let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
+        let (layer_selection, gc_cutoff) = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
            let gc_info = self.gc_info.read().unwrap();
-            let mut retain_lsns_below_horizon = Vec::new();
+            if !gc_info.retain_lsns.is_empty() || !gc_info.leases.is_empty() {
+                return Err(CompactionError::Other(anyhow!(
+                    "enhanced legacy compaction currently does not support retain_lsns (branches)"
+                )));
+            }
            let gc_cutoff = gc_info.cutoffs.select_min();
-            for (lsn, _timeline_id) in &gc_info.retain_lsns {
-                if lsn < &gc_cutoff {
-                    retain_lsns_below_horizon.push(*lsn);
-                }
-            }
-            for lsn in gc_info.leases.keys() {
-                if lsn < &gc_cutoff {
-                    retain_lsns_below_horizon.push(*lsn);
-                }
-            }
            let mut selected_layers = Vec::new();
+            // TODO: consider retain_lsns
            drop(gc_info);
            for desc in layers.iter_historic_layers() {
                if desc.get_lsn_range().start <= gc_cutoff {
                    selected_layers.push(guard.get_from_desc(&desc));
                }
            }
-            retain_lsns_below_horizon.sort();
-            (selected_layers, gc_cutoff, retain_lsns_below_horizon)
+            (selected_layers, gc_cutoff)
        };
-        let lowest_retain_lsn = retain_lsns_below_horizon
-            .first()
-            .copied()
-            .unwrap_or(gc_cutoff);
-        if cfg!(debug_assertions) {
-            assert_eq!(
-                lowest_retain_lsn,
-                retain_lsns_below_horizon
-                    .iter()
-                    .min()
-                    .copied()
-                    .unwrap_or(gc_cutoff)
-            );
-        }
        info!(
-            "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
+            "picked {} layers for compaction with gc_cutoff={}",
            layer_selection.len(),
-            gc_cutoff,
-            lowest_retain_lsn
+            gc_cutoff
        );
        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
        // Also, collect the layer information to decide when to split the new delta layers.
@@ -1364,13 +1072,61 @@ impl Timeline {
        let mut accumulated_values = Vec::new();
        let mut last_key: Option<Key> = None;

+        /// Split one key's `accumulated_values` at `horizon`: return the entries with an LSN above it (in input order) and the image `reconstruct_value` yields for `key` at `horizon`.
+        async fn flush_accumulated_states(
+            tline: &Arc<Timeline>,
+            key: Key,
+            accumulated_values: &[(Key, Lsn, crate::repository::Value)],
+            horizon: Lsn,
+        ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
+            let mut base_image = None;
+            let mut keys_above_horizon = Vec::new();
+            let mut delta_above_base_image = Vec::new();
+            // Walk the deltas/images back to front: keep what lies above the horizon, and at or below it gather WAL records until the first image is found (older entries are garbage).
+            for (key, lsn, val) in accumulated_values.iter().rev() {
+                if *lsn > horizon {
+                    if let Some((_, prev_lsn, _)) = keys_above_horizon.last_mut() {
+                        if *prev_lsn == *lsn {
+                            // The case that we have an LSN with both data from the delta layer and the image layer. As
+                            // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, the intent
+                            // is to drop the delta and keep the image: e.g. with delta layer key1@0x10, key1@0x20 and
+                            // image layer key1@0x10, keep the image for key1@0x10 and the delta for key1@0x20.
+                            //
+                            // NOTE(review): this loop iterates in reverse, so the entry already pushed for this LSN is
+                            // the one ordered later in the input; verify that the entry skipped here is really the delta.
+                            continue;
+                        }
+                    }
+                    keys_above_horizon.push((*key, *lsn, val.clone()));
+                } else if *lsn <= horizon {
+                    match val {
+                        crate::repository::Value::Image(image) => {
+                            base_image = Some((*lsn, image.clone()));
+                            break;
+                        }
+                        crate::repository::Value::WalRecord(wal) => {
+                            delta_above_base_image.push((*lsn, wal.clone()));
+                        }
+                    }
+                }
+            }
+            // Do not reverse delta_above_base_image: the reconstruct state expects its records in reverse order.
+            keys_above_horizon.reverse();
+            let state = ValueReconstructState {
+                img: base_image,
+                records: delta_above_base_image,
+            };
+            let img = tline.reconstruct_value(key, horizon, state).await?;
+            Ok((keys_above_horizon, img))
+        }
+
        async fn flush_deltas(
            deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
            last_key: Key,
            delta_split_points: &[Key],
            current_delta_split_point: &mut usize,
            tline: &Arc<Timeline>,
-            lowest_retain_lsn: Lsn,
+            gc_cutoff: Lsn,
            ctx: &RequestContext,
        ) -> anyhow::Result<Option<ResidentLayer>> {
            // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
@@ -1405,7 +1161,7 @@ impl Timeline {
                tline.timeline_id,
                tline.tenant_shard_id,
                deltas.first().unwrap().0,
-                lowest_retain_lsn..end_lsn,
+                gc_cutoff..end_lsn,
                ctx,
            )
            .await?;
@@ -1422,7 +1178,7 @@ impl Timeline {
            self.timeline_id,
            self.tenant_shard_id,
            &(Key::MIN..Key::MAX), // covers the full key range
-            lowest_retain_lsn,
+            gc_cutoff,
            ctx,
        )
        .await?;
@@ -1439,19 +1195,12 @@ impl Timeline {
                accumulated_values.push((key, lsn, val));
            } else {
                let last_key = last_key.as_mut().unwrap();
-                let retention = self
-                    .generate_key_retention(
-                        *last_key,
-                        &accumulated_values,
-                        gc_cutoff,
-                        &retain_lsns_below_horizon,
-                        COMPACTION_DELTA_THRESHOLD,
-                    )
-                    .await?;
+                let (deltas, image) =
+                    flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff)
+                        .await?;
                // Put the image into the image layer. Currently we have a single big layer for the compaction.
-                retention
-                    .pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx)
-                    .await?;
+                image_layer_writer.put_image(*last_key, image, ctx).await?;
+                delta_values.extend(deltas);
                delta_layers.extend(
                    flush_deltas(
                        &mut delta_values,
@@ -1459,7 +1208,7 @@ impl Timeline {
                        &delta_split_points,
                        &mut current_delta_split_point,
                        self,
-                        lowest_retain_lsn,
+                        gc_cutoff,
                        ctx,
                    )
                    .await?,
@@ -1472,19 +1221,11 @@ impl Timeline {

        let last_key = last_key.expect("no keys produced during compaction");
        // TODO: move this part to the loop body
-        let retention = self
-            .generate_key_retention(
-                last_key,
-                &accumulated_values,
-                gc_cutoff,
-                &retain_lsns_below_horizon,
-                COMPACTION_DELTA_THRESHOLD,
-            )
-            .await?;
+        let (deltas, image) =
+            flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
        // Put the image into the image layer. Currently we have a single big layer for the compaction.
-        retention
-            .pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx)
-            .await?;
+        image_layer_writer.put_image(last_key, image, ctx).await?;
+        delta_values.extend(deltas);
        delta_layers.extend(
            flush_deltas(
                &mut delta_values,
@@ -1492,7 +1233,7 @@ impl Timeline {
                &delta_split_points,
                &mut current_delta_split_point,
                self,
-                lowest_retain_lsn,
+                gc_cutoff,
                ctx,
            )
            .await?,
@@ -1540,7 +1281,7 @@ impl TimelineAdaptor {
        }
    }

-    pub async fn flush_updates(&mut self) -> Result<(), CompactionError> {
+    pub async fn flush_updates(&mut self) -> anyhow::Result<()> {
        let layers_to_delete = {
            let guard = self.timeline.layers.read().await;
            self.layers_to_delete
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -148,14 +148,14 @@ async fn cleanup_remaining_timeline_fs_traces(
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
 async fn remove_timeline_from_tenant(
    tenant: &Tenant,
-    timeline: &Timeline,
+    timeline_id: TimelineId,
    _: &DeletionGuard, // using it as a witness
 ) -> anyhow::Result<()> {
    // Remove the timeline from the map.
    let mut timelines = tenant.timelines.lock().unwrap();
    let children_exist = timelines
        .iter()
-        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
+        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
    // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
    // We already deleted the layer files, so it's probably best to panic.
    // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
@@ -164,7 +164,7 @@ async fn remove_timeline_from_tenant(
    }

    timelines
-        .remove(&timeline.timeline_id)
+        .remove(&timeline_id)
        .expect("timeline that we were deleting was concurrently removed from 'timelines' map");

    drop(timelines);
@@ -391,6 +391,7 @@ impl DeleteTimelineFlow {
            Some(tenant_shard_id),
            Some(timeline_id),
            "timeline_delete",
+            false,
            async move {
                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
                    error!("Error: {err:#}");
@@ -414,7 +415,7 @@ impl DeleteTimelineFlow {

        pausable_failpoint!("in_progress_delete");

-        remove_timeline_from_tenant(tenant, timeline, &guard).await?;
+        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

        *guard = Self::Finished;

--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -26,7 +26,7 @@ pub(crate) enum Error {
    #[error("flushing failed")]
    FlushAncestor(#[source] FlushLayerError),
    #[error("layer download failed")]
-    RewrittenDeltaDownloadFailed(#[source] crate::tenant::storage_layer::layer::DownloadError),
+    RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
    #[error("copying LSN prefix locally failed")]
    CopyDeltaPrefix(#[source] anyhow::Error),
    #[error("upload rewritten layer")]
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -65,6 +65,7 @@ impl Timeline {
                "layer eviction for {}/{}",
                self.tenant_shard_id, self.timeline_id
            ),
+            false,
            async move {
                tokio::select! {
                    _ = self_clone.cancel.cancelled() => { return Ok(()); }
@@ -225,7 +226,7 @@ impl Timeline {
                    continue;
                }

-                let last_activity_ts = layer.access_stats().latest_activity();
+                let last_activity_ts = layer.access_stats().latest_activity_or_now();

                let no_activity_for = match now.duration_since(last_activity_ts) {
                    Ok(d) => d,
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -255,14 +255,6 @@ impl LayerManager {
                new_layer.layer_desc().lsn_range
            );

-            // Transfer visibilty hint from old to new layer, since the new layer covers the same key space.  This is not guaranteed to
-            // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
-            // always marking rewritten layers as visible.
-            new_layer
-                .as_ref()
-                .access_stats()
-                .set_visibility(old_layer.access_stats().visibility());
-
            // Safety: we may never rewrite the same file in-place.  Callers are responsible
            // for ensuring that they only rewrite layers after something changes the path,
            // such as an increment in the generation number.
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -130,7 +130,7 @@ pub(super) enum UploadQueueStopped {
 }

 #[derive(thiserror::Error, Debug)]
-pub enum NotInitialized {
+pub(crate) enum NotInitialized {
    #[error("queue is in state Uninitialized")]
    Uninitialized,
    #[error("queue is in state Stopped")]
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -597,7 +597,7 @@ _PG_init(void)

 	pg_init_libpagestore();
 	pg_init_walproposer();
-	WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
+        WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
 	LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

 	InitLogicalReplicationMonitor();
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,6 +1,6 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
-default_version = '1.4'
+default_version = '1.3'
 module_pathname = '$libdir/neon'
 relocatable = true
 trusted = true
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -220,8 +220,7 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
 			return NEON_WALREAD_ERROR;
 		}
 		/* we'll poll immediately */
-		state->rem_state = RS_CONNECTING_WRITE;
-		return NEON_WALREAD_WOULDBLOCK;
+		state->rem_state = RS_CONNECTING_READ;
 	}

 	if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
--- a/pgxn/neon_test_utils/neon_test_utils--1.3.sql
+++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql
@@ -7,12 +7,6 @@ AS 'MODULE_PATHNAME', 'test_consume_xids'
 LANGUAGE C STRICT
 PARALLEL UNSAFE;

-CREATE FUNCTION test_consume_oids(oid int)
-RETURNS VOID
-AS 'MODULE_PATHNAME', 'test_consume_oids'
-LANGUAGE C STRICT
-PARALLEL UNSAFE;
-
 CREATE FUNCTION test_consume_cpu(seconds int)
 RETURNS VOID
 AS 'MODULE_PATHNAME', 'test_consume_cpu'
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -35,7 +35,6 @@ PG_MODULE_MAGIC;
 extern void _PG_init(void);

 PG_FUNCTION_INFO_V1(test_consume_xids);
-PG_FUNCTION_INFO_V1(test_consume_oids);
 PG_FUNCTION_INFO_V1(test_consume_cpu);
 PG_FUNCTION_INFO_V1(test_consume_memory);
 PG_FUNCTION_INFO_V1(test_release_memory);
@@ -75,21 +74,6 @@ _PG_init(void)

 #define neon_read_at_lsn neon_read_at_lsn_ptr

-/*
- * test_consume_oids(int4), for rapidly consuming OIDs, to test wraparound.
- * Unlike test_consume_xids which is passed number of xids to be consumed,
- * this function is given the target Oid.
- */
-Datum
-test_consume_oids(PG_FUNCTION_ARGS)
-{
-	int32 oid = PG_GETARG_INT32(0);
-
-	while (oid != GetNewObjectId());
-
-	PG_RETURN_VOID();
-}
-
 /*
 * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound.
 */
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -717,10 +717,8 @@ mod tests {
                _ => panic!("wrong message"),
            }
        });
-        let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
-            EndpointRateLimiter::DEFAULT,
-            64,
-        ));
+        let endpoint_rate_limiter =
+            Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));

        let _creds = auth_quirks(
            &mut ctx,
@@ -769,10 +767,8 @@ mod tests {
            frontend::password_message(b"my-secret-password", &mut write).unwrap();
            client.write_all(&write).await.unwrap();
        });
-        let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
-            EndpointRateLimiter::DEFAULT,
-            64,
-        ));
+        let endpoint_rate_limiter =
+            Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));

        let _creds = auth_quirks(
            &mut ctx,
@@ -822,10 +818,8 @@ mod tests {
            client.write_all(&write).await.unwrap();
        });

-        let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
-            EndpointRateLimiter::DEFAULT,
-            64,
-        ));
+        let endpoint_rate_limiter =
+            Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));

        let creds = auth_quirks(
            &mut ctx,
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -22,9 +22,7 @@ use proxy::http;
 use proxy::http::health_server::AppMetrics;
 use proxy::metrics::Metrics;
 use proxy::rate_limiter::EndpointRateLimiter;
-use proxy::rate_limiter::LeakyBucketConfig;
 use proxy::rate_limiter::RateBucketInfo;
-use proxy::rate_limiter::WakeComputeRateLimiter;
 use proxy::redis::cancellation_publisher::RedisPublisherClient;
 use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use proxy::redis::elasticache;
@@ -178,9 +176,6 @@ struct ProxyCliArgs {
    /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
    #[clap(long)]
    redis_notifications: Option<String>,
-    /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain".
-    #[clap(long, default_value = "irsa")]
-    redis_auth_type: String,
    /// redis host for streaming connections (might be different from the notifications host)
    #[clap(long)]
    redis_host: Option<String>,
@@ -324,38 +319,24 @@ async fn main() -> anyhow::Result<()> {
        ),
        aws_credentials_provider,
    ));
-    let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
-        ("plain", redis_url) => match redis_url {
-            None => {
-                bail!("plain auth requires redis_notifications to be set");
-            }
-            Some(url) => Some(
-                ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()),
+    let regional_redis_client = match (args.redis_host, args.redis_port) {
+        (Some(host), Some(port)) => Some(
+            ConnectionWithCredentialsProvider::new_with_credentials_provider(
+                host,
+                port,
+                elasticache_credentials_provider.clone(),
            ),
-        },
-        ("irsa", _) => match (&args.redis_host, args.redis_port) {
-            (Some(host), Some(port)) => Some(
-                ConnectionWithCredentialsProvider::new_with_credentials_provider(
-                    host.to_string(),
-                    port,
-                    elasticache_credentials_provider.clone(),
-                ),
-            ),
-            (None, None) => {
-                warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client");
-                None
-            }
-            _ => {
-                bail!("redis-host and redis-port must be specified together");
-            }
-        },
+        ),
+        (None, None) => {
+            warn!("Redis events from console are disabled");
+            None
+        }
        _ => {
-            bail!("unknown auth type given");
+            bail!("redis-host and redis-port must be specified together");
        }
    };
-
    let redis_notifications_client = if let Some(url) = args.redis_notifications {
-        Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()))
+        Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
    } else {
        regional_redis_client.clone()
    };
@@ -392,24 +373,9 @@ async fn main() -> anyhow::Result<()> {
        proxy::metrics::CancellationSource::FromClient,
    ));

-    // bit of a hack - find the min rps and max rps supported and turn it into
-    // leaky bucket config instead
-    let max = args
-        .endpoint_rps_limit
-        .iter()
-        .map(|x| x.rps())
-        .max_by(f64::total_cmp)
-        .unwrap_or(EndpointRateLimiter::DEFAULT.max);
-    let rps = args
-        .endpoint_rps_limit
-        .iter()
-        .map(|x| x.rps())
-        .min_by(f64::total_cmp)
-        .unwrap_or(EndpointRateLimiter::DEFAULT.rps);
-    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
-        LeakyBucketConfig { rps, max },
-        64,
-    ));
+    let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
+    RateBucketInfo::validate(&mut endpoint_rps_limit)?;
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));

    // client facing tasks. these will exit on error or on cancellation
    // cancellation returns Ok(())
@@ -611,7 +577,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
            let wake_compute_endpoint_rate_limiter =
-                Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
+                Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit));
            let api = console::provider::neon::Api::new(
                endpoint,
                caches,
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -12,7 +12,7 @@ use crate::{
    console::messages::{ColdStartInfo, Reason},
    http,
    metrics::{CacheOutcome, Metrics},
-    rate_limiter::WakeComputeRateLimiter,
+    rate_limiter::EndpointRateLimiter,
    scram, EndpointCacheKey,
 };
 use crate::{cache::Cached, context::RequestMonitoring};
@@ -26,7 +26,7 @@ pub struct Api {
    endpoint: http::Endpoint,
    pub caches: &'static ApiCaches,
    pub locks: &'static ApiLocks<EndpointCacheKey>,
-    pub wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
+    pub wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
    jwt: String,
 }

@@ -36,7 +36,7 @@ impl Api {
        endpoint: http::Endpoint,
        caches: &'static ApiCaches,
        locks: &'static ApiLocks<EndpointCacheKey>,
-        wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
+        wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
    ) -> Self {
        let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
            Ok(v) => v,
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -3,8 +3,4 @@ mod limiter;
 pub use limit_algorithm::{
    aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
 };
-pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
-mod leaky_bucket;
-pub use leaky_bucket::{
-    EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState,
-};
+pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo};
--- a/proxy/src/rate_limiter/leaky_bucket.rs
+++ b/proxy/src/rate_limiter/leaky_bucket.rs
@@ -1,171 +0,0 @@
-use std::{
-    hash::Hash,
-    sync::atomic::{AtomicUsize, Ordering},
-};
-
-use ahash::RandomState;
-use dashmap::DashMap;
-use rand::{thread_rng, Rng};
-use tokio::time::Instant;
-use tracing::info;
-
-use crate::intern::EndpointIdInt;
-
-// Simple per-endpoint rate limiter.
-pub type EndpointRateLimiter = LeakyBucketRateLimiter<EndpointIdInt>;
-
-pub struct LeakyBucketRateLimiter<Key> {
-    map: DashMap<Key, LeakyBucketState, RandomState>,
-    config: LeakyBucketConfig,
-    access_count: AtomicUsize,
-}
-
-impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
-    pub const DEFAULT: LeakyBucketConfig = LeakyBucketConfig {
-        rps: 600.0,
-        max: 1500.0,
-    };
-
-    pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self {
-        Self {
-            map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
-            config,
-            access_count: AtomicUsize::new(0),
-        }
-    }
-
-    /// Check that number of connections to the endpoint is below `max_rps` rps.
-    pub fn check(&self, key: K, n: u32) -> bool {
-        let now = Instant::now();
-
-        if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
-            self.do_gc(now);
-        }
-
-        let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState {
-            time: now,
-            filled: 0.0,
-        });
-
-        entry.check(&self.config, now, n as f64)
-    }
-
-    fn do_gc(&self, now: Instant) {
-        info!(
-            "cleaning up bucket rate limiter, current size = {}",
-            self.map.len()
-        );
-        let n = self.map.shards().len();
-        let shard = thread_rng().gen_range(0..n);
-        self.map.shards()[shard]
-            .write()
-            .retain(|_, value| !value.get_mut().update(&self.config, now));
-    }
-}
-
-pub struct LeakyBucketConfig {
-    pub rps: f64,
-    pub max: f64,
-}
-
-pub struct LeakyBucketState {
-    filled: f64,
-    time: Instant,
-}
-
-impl LeakyBucketConfig {
-    pub fn new(rps: f64, max: f64) -> Self {
-        assert!(rps > 0.0, "rps must be positive");
-        assert!(max > 0.0, "max must be positive");
-        Self { rps, max }
-    }
-}
-
-impl LeakyBucketState {
-    pub fn new() -> Self {
-        Self {
-            filled: 0.0,
-            time: Instant::now(),
-        }
-    }
-
-    /// updates the timer and returns true if the bucket is empty
-    fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool {
-        let drain = now.duration_since(self.time);
-        let drain = drain.as_secs_f64() * info.rps;
-
-        self.filled = (self.filled - drain).clamp(0.0, info.max);
-        self.time = now;
-
-        self.filled == 0.0
-    }
-
-    pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
-        self.update(info, now);
-
-        if self.filled + n > info.max {
-            return false;
-        }
-        self.filled += n;
-
-        true
-    }
-}
-
-impl Default for LeakyBucketState {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::time::Duration;
-
-    use tokio::time::Instant;
-
-    use super::{LeakyBucketConfig, LeakyBucketState};
-
-    #[tokio::test(start_paused = true)]
-    async fn check() {
-        let info = LeakyBucketConfig::new(500.0, 2000.0);
-        let mut bucket = LeakyBucketState::new();
-
-        // should work for 2000 requests this second
-        for _ in 0..2000 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
-        }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
-        assert_eq!(bucket.filled, 2000.0);
-
-        // in 1ms we should drain 0.5 tokens.
-        // make sure we don't lose any tokens
-        tokio::time::advance(Duration::from_millis(1)).await;
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
-        tokio::time::advance(Duration::from_millis(1)).await;
-        assert!(bucket.check(&info, Instant::now(), 1.0));
-
-        // in 10ms we should drain 5 tokens
-        tokio::time::advance(Duration::from_millis(10)).await;
-        for _ in 0..5 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
-        }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
-
-        // in 10s we should drain 5000 tokens
-        // but cap is only 2000
-        tokio::time::advance(Duration::from_secs(10)).await;
-        for _ in 0..2000 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
-        }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
-
-        // should sustain 500rps
-        for _ in 0..2000 {
-            tokio::time::advance(Duration::from_millis(10)).await;
-            for _ in 0..5 {
-                assert!(bucket.check(&info, Instant::now(), 1.0));
-            }
-        }
-    }
-}
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -61,7 +61,7 @@ impl GlobalRateLimiter {
 // Purposefully ignore user name and database name as clients can reconnect
 // with different names, so we'll end up sending some http requests to
 // the control plane.
-pub type WakeComputeRateLimiter = BucketRateLimiter<EndpointIdInt, StdRng, RandomState>;
+pub type EndpointRateLimiter = BucketRateLimiter<EndpointIdInt, StdRng, RandomState>;

 pub struct BucketRateLimiter<Key, Rand = StdRng, Hasher = RandomState> {
    map: DashMap<Key, Vec<RateBucket>, Hasher>,
@@ -103,7 +103,7 @@ pub struct RateBucketInfo {

 impl std::fmt::Display for RateBucketInfo {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let rps = self.rps().floor() as u64;
+        let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64;
        write!(f, "{rps}@{}", humantime::format_duration(self.interval))
    }
 }
@@ -140,10 +140,6 @@ impl RateBucketInfo {
        Self::new(200, Duration::from_secs(600)),
    ];

-    pub fn rps(&self) -> f64 {
-        (self.max_rpi as f64) / self.interval.as_secs_f64()
-    }
-
    pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
        info.sort_unstable_by_key(|info| info.interval);
        let invalid = info
@@ -249,7 +245,7 @@ mod tests {
    use rustc_hash::FxHasher;
    use tokio::time;

-    use super::{BucketRateLimiter, WakeComputeRateLimiter};
+    use super::{BucketRateLimiter, EndpointRateLimiter};
    use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId};

    #[test]
@@ -297,7 +293,7 @@ mod tests {
            .map(|s| s.parse().unwrap())
            .collect();
        RateBucketInfo::validate(&mut rates).unwrap();
-        let limiter = WakeComputeRateLimiter::new(rates);
+        let limiter = EndpointRateLimiter::new(rates);

        let endpoint = EndpointId::from("ep-my-endpoint-1234");
        let endpoint = EndpointIdInt::from(endpoint);
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -22,8 +22,7 @@ struct HeartbeaterTask {

    state: HashMap<NodeId, PageserverState>,

-    max_offline_interval: Duration,
-    max_warming_up_interval: Duration,
+    max_unavailable_interval: Duration,
    jwt_token: Option<String>,
 }

@@ -32,9 +31,7 @@ pub(crate) enum PageserverState {
    Available {
        last_seen_at: Instant,
        utilization: PageserverUtilization,
-    },
-    WarmingUp {
-        started_at: Instant,
+        new: bool,
    },
    Offline,
 }
@@ -60,18 +57,12 @@ pub(crate) struct Heartbeater {
 impl Heartbeater {
    pub(crate) fn new(
        jwt_token: Option<String>,
-        max_offline_interval: Duration,
-        max_warming_up_interval: Duration,
+        max_unavailable_interval: Duration,
        cancel: CancellationToken,
    ) -> Self {
        let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
-        let mut heartbeater = HeartbeaterTask::new(
-            receiver,
-            jwt_token,
-            max_offline_interval,
-            max_warming_up_interval,
-            cancel,
-        );
+        let mut heartbeater =
+            HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel);
        tokio::task::spawn(async move { heartbeater.run().await });

        Self { sender }
@@ -97,16 +88,14 @@ impl HeartbeaterTask {
    fn new(
        receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
        jwt_token: Option<String>,
-        max_offline_interval: Duration,
-        max_warming_up_interval: Duration,
+        max_unavailable_interval: Duration,
        cancel: CancellationToken,
    ) -> Self {
        Self {
            receiver,
            cancel,
            state: HashMap::new(),
-            max_offline_interval,
-            max_warming_up_interval,
+            max_unavailable_interval,
            jwt_token,
        }
    }
@@ -139,15 +128,16 @@ impl HeartbeaterTask {
            heartbeat_futs.push({
                let jwt_token = self.jwt_token.clone();
                let cancel = self.cancel.clone();
+                let new_node = !self.state.contains_key(node_id);

                // Clone the node and mark it as available such that the request
                // goes through to the pageserver even when the node is marked offline.
                // This doesn't impact the availability observed by [`crate::service::Service`].
-                let mut node_clone = node.clone();
-                node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
+                let mut node = node.clone();
+                node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));

                async move {
-                    let response = node_clone
+                    let response = node
                        .with_client_retries(
                            |client| async move { client.get_utilization().await },
                            &jwt_token,
@@ -171,12 +161,7 @@ impl HeartbeaterTask {
                        PageserverState::Available {
                            last_seen_at: Instant::now(),
                            utilization,
-                        }
-                    } else if let NodeAvailability::WarmingUp(last_seen_at) =
-                        node.get_availability()
-                    {
-                        PageserverState::WarmingUp {
-                            started_at: last_seen_at,
+                            new: new_node,
                        }
                    } else {
                        PageserverState::Offline
@@ -202,67 +187,53 @@ impl HeartbeaterTask {
                }
            }
        }
-
-        let mut warming_up = 0;
-        let mut offline = 0;
-        for state in new_state.values() {
-            match state {
-                PageserverState::WarmingUp { .. } => {
-                    warming_up += 1;
-                }
-                PageserverState::Offline { .. } => offline += 1,
-                PageserverState::Available { .. } => {}
-            }
-        }
-
        tracing::info!(
-            "Heartbeat round complete for {} nodes, {} warming-up, {} offline",
+            "Heartbeat round complete for {} nodes, {} offline",
            new_state.len(),
-            warming_up,
-            offline
+            new_state
+                .values()
+                .filter(|s| match s {
+                    PageserverState::Available { .. } => {
+                        false
+                    }
+                    PageserverState::Offline => true,
+                })
+                .count()
        );

        let mut deltas = Vec::new();
        let now = Instant::now();
-        for (node_id, ps_state) in new_state.iter_mut() {
+        for (node_id, ps_state) in new_state {
            use std::collections::hash_map::Entry::*;
-            let entry = self.state.entry(*node_id);
+            let entry = self.state.entry(node_id);

            let mut needs_update = false;
            match entry {
                Occupied(ref occ) => match (occ.get(), &ps_state) {
                    (PageserverState::Offline, PageserverState::Offline) => {}
                    (PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
-                        if now - *last_seen_at >= self.max_offline_interval {
-                            deltas.push((*node_id, ps_state.clone()));
+                        if now - *last_seen_at >= self.max_unavailable_interval {
+                            deltas.push((node_id, ps_state.clone()));
                            needs_update = true;
                        }
                    }
-                    (_, PageserverState::WarmingUp { started_at }) => {
-                        if now - *started_at >= self.max_warming_up_interval {
-                            *ps_state = PageserverState::Offline;
-                        }
-
-                        deltas.push((*node_id, ps_state.clone()));
-                        needs_update = true;
-                    }
                    _ => {
-                        deltas.push((*node_id, ps_state.clone()));
+                        deltas.push((node_id, ps_state.clone()));
                        needs_update = true;
                    }
                },
                Vacant(_) => {
                    // This is a new node. Don't generate a delta for it.
-                    deltas.push((*node_id, ps_state.clone()));
+                    deltas.push((node_id, ps_state.clone()));
                }
            }

            match entry {
                Occupied(mut occ) if needs_update => {
-                    (*occ.get_mut()) = ps_state.clone();
+                    (*occ.get_mut()) = ps_state;
                }
                Vacant(vac) => {
-                    vac.insert(ps_state.clone());
+                    vac.insert(ps_state);
                }
                _ => {}
            }
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -10,8 +10,7 @@ use storage_controller::http::make_router;
 use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
 use storage_controller::service::{
-    Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
-    RECONCILER_CONCURRENCY_DEFAULT,
+    Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
@@ -62,12 +61,7 @@ struct Cli {

    /// Grace period before marking unresponsive pageserver offline
    #[arg(long)]
-    max_offline_interval: Option<humantime::Duration>,
-
-    /// More tolerant grace period before marking unresponsive pagserver offline used
-    /// around pageserver restarts
-    #[arg(long)]
-    max_warming_up_interval: Option<humantime::Duration>,
+    max_unavailable_interval: Option<humantime::Duration>,

    /// Size threshold for automatically splitting shards (disabled by default)
    #[arg(long)]
@@ -260,14 +254,10 @@ async fn async_main() -> anyhow::Result<()> {
        jwt_token: secrets.jwt_token,
        control_plane_jwt_token: secrets.control_plane_jwt_token,
        compute_hook_url: args.compute_hook_url,
-        max_offline_interval: args
-            .max_offline_interval
+        max_unavailable_interval: args
+            .max_unavailable_interval
            .map(humantime::Duration::into)
-            .unwrap_or(MAX_OFFLINE_INTERVAL_DEFAULT),
-        max_warming_up_interval: args
-            .max_warming_up_interval
-            .map(humantime::Duration::into)
-            .unwrap_or(MAX_WARMING_UP_INTERVAL_DEFAULT),
+            .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
        reconciler_concurrency: args
            .reconciler_concurrency
            .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration};
 use pageserver_api::{
    controller_api::{
        NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
-        TenantLocateResponseShard,
+        TenantLocateResponseShard, UtilizationScore,
    },
    shard::TenantShardId,
 };
@@ -46,8 +46,6 @@ pub(crate) struct Node {
 /// whether/how they changed it.
 pub(crate) enum AvailabilityTransition {
    ToActive,
-    ToWarmingUpFromActive,
-    ToWarmingUpFromOffline,
    ToOffline,
    Unchanged,
 }
@@ -92,34 +90,22 @@ impl Node {
        }
    }

-    pub(crate) fn get_availability(&self) -> NodeAvailability {
-        self.availability
-    }
-
    pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
-        use AvailabilityTransition::*;
-        use NodeAvailability::WarmingUp;
-
        match self.get_availability_transition(availability) {
-            ToActive => {
+            AvailabilityTransition::ToActive => {
                // Give the node a new cancellation token, effectively resetting it to un-cancelled.  Any
                // users of previously-cloned copies of the node will still see the old cancellation
                // state.  For example, Reconcilers in flight will have to complete and be spawned
                // again to realize that the node has become available.
                self.cancel = CancellationToken::new();
            }
-            ToOffline | ToWarmingUpFromActive => {
+            AvailabilityTransition::ToOffline => {
                // Fire the node's cancellation token to cancel any in-flight API requests to it
                self.cancel.cancel();
            }
-            Unchanged | ToWarmingUpFromOffline => {}
-        }
-
-        if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) {
-            self.availability = WarmingUp(std::cmp::max(crnt, proposed));
-        } else {
-            self.availability = availability;
+            AvailabilityTransition::Unchanged => {}
        }
+        self.availability = availability;
    }

    /// Without modifying the availability of the node, convert the intended availability
@@ -134,10 +120,16 @@ impl Node {
        match (self.availability, availability) {
            (Offline, Active(_)) => ToActive,
            (Active(_), Offline) => ToOffline,
-            (Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
-            (WarmingUp(_), Offline) => ToOffline,
-            (WarmingUp(_), Active(_)) => ToActive,
-            (Offline, WarmingUp(_)) => ToWarmingUpFromOffline,
+            // Consider the case when the storage controller handles the re-attach of a node
+            // before the heartbeats detect that the node is back online. We still need
+            // [`Service::node_configure`] to attempt reconciliations for shards with an
+            // unknown observed location.
+            // The unsavoury match arm below handles this situation.
+            (Active(lhs), Active(rhs))
+                if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() =>
+            {
+                ToActive
+            }
            _ => Unchanged,
        }
    }
@@ -155,7 +147,7 @@ impl Node {
    pub(crate) fn may_schedule(&self) -> MaySchedule {
        let score = match self.availability {
            NodeAvailability::Active(score) => score,
-            NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
+            NodeAvailability::Offline => return MaySchedule::No,
        };

        match self.scheduling {
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -100,13 +100,9 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);

 /// How long a node may be unresponsive to heartbeats before we declare it offline.
 /// This must be long enough to cover node restarts as well as normal operations: in future
-pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
-
-/// How long a node may be unresponsive to heartbeats during start up before we declare it
-/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
-/// handling of the re-attach response may take a long time and blocks heartbeats from
-/// being handled on the pageserver side.
-pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
+/// it should be separated into distinct timeouts for startup vs. normal operation
+/// (`<https://github.com/neondatabase/neon/issues/7552>`)
+pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);

 #[derive(Clone, strum_macros::Display)]
 enum TenantOperations {
@@ -240,12 +236,7 @@ pub struct Config {
    /// Grace period within which a pageserver does not respond to heartbeats, but is still
    /// considered active. Once the grace period elapses, the next heartbeat failure will
    /// mark the pagseserver offline.
-    pub max_offline_interval: Duration,
-
-    /// Extended grace period within which pageserver may not respond to heartbeats.
-    /// This extended grace period kicks in after the node has been drained for restart
-    /// and/or upon handling the re-attach request from a node.
-    pub max_warming_up_interval: Duration,
+    pub max_unavailable_interval: Duration,

    /// How many Reconcilers may be spawned concurrently
    pub reconciler_concurrency: usize,
@@ -596,9 +587,6 @@ impl Service {
                        online_nodes.insert(node_id, utilization);
                    }
                    PageserverState::Offline => {}
-                    PageserverState::WarmingUp { .. } => {
-                        unreachable!("Nodes are never marked warming-up during startup reconcile")
-                    }
                }
            }
        }
@@ -791,54 +779,61 @@ impl Service {
            let res = self.heartbeater.heartbeat(nodes).await;
            if let Ok(deltas) = res {
                for (node_id, state) in deltas.0 {
-                    let new_availability = match state {
-                        PageserverState::Available { utilization, .. } => NodeAvailability::Active(
-                            UtilizationScore(utilization.utilization_score),
+                    let (new_node, new_availability) = match state {
+                        PageserverState::Available {
+                            utilization, new, ..
+                        } => (
+                            new,
+                            NodeAvailability::Active(UtilizationScore(
+                                utilization.utilization_score,
+                            )),
                        ),
-                        PageserverState::WarmingUp { started_at } => {
-                            NodeAvailability::WarmingUp(started_at)
-                        }
-                        PageserverState::Offline => {
-                            // The node might have been placed in the WarmingUp state
-                            // while the heartbeat round was on-going. Hence, filter out
-                            // offline transitions for WarmingUp nodes that are still within
-                            // their grace period.
-                            if let Ok(NodeAvailability::WarmingUp(started_at)) =
-                                self.get_node(node_id).await.map(|n| n.get_availability())
-                            {
-                                let now = Instant::now();
-                                if now - started_at >= self.config.max_warming_up_interval {
-                                    NodeAvailability::Offline
-                                } else {
-                                    NodeAvailability::WarmingUp(started_at)
-                                }
-                            } else {
-                                NodeAvailability::Offline
-                            }
-                        }
+                        PageserverState::Offline => (false, NodeAvailability::Offline),
                    };

-                    // This is the code path for geniune availability transitions (i.e node
-                    // goes unavailable and/or comes back online).
-                    let res = self
-                        .node_configure(node_id, Some(new_availability), None)
-                        .await;
+                    if new_node {
+                        // When the heartbeats detect a newly added node, we don't wish
+                        // to attempt to reconcile the shards assigned to it. The node
+                        // is likely handling its re-attach response, so reconciling now
+                        // would be counterproductive.
+                        //
+                        // Instead, update the in-memory state with the details learned about the
+                        // node.
+                        let mut locked = self.inner.write().unwrap();
+                        let (nodes, _tenants, scheduler) = locked.parts_mut();

-                    match res {
-                        Ok(()) => {}
-                        Err(ApiError::NotFound(_)) => {
-                            // This should be rare, but legitimate since the heartbeats are done
-                            // on a snapshot of the nodes.
-                            tracing::info!("Node {} was not found after heartbeat round", node_id);
+                        let mut new_nodes = (**nodes).clone();
+
+                        if let Some(node) = new_nodes.get_mut(&node_id) {
+                            node.set_availability(new_availability);
+                            scheduler.node_upsert(node);
                        }
-                        Err(err) => {
-                            // Transition to active involves reconciling: if a node responds to a heartbeat then
-                            // becomes unavailable again, we may get an error here.
-                            tracing::error!(
-                                "Failed to update node {} after heartbeat round: {}",
-                                node_id,
-                                err
-                            );
+
+                        locked.nodes = Arc::new(new_nodes);
+                    } else {
+                        // This is the code path for genuine availability transitions (i.e. node
+                        // goes unavailable and/or comes back online).
+                        let res = self
+                            .node_configure(node_id, Some(new_availability), None)
+                            .await;
+
+                        match res {
+                            Ok(()) => {}
+                            Err(ApiError::NotFound(_)) => {
+                                // This should be rare, but legitimate since the heartbeats are done
+                                // on a snapshot of the nodes.
+                                tracing::info!(
+                                    "Node {} was not found after heartbeat round",
+                                    node_id
+                                );
+                            }
+                            Err(err) => {
+                                tracing::error!(
+                                    "Failed to update node {} after heartbeat round: {}",
+                                    node_id,
+                                    err
+                                );
+                            }
                        }
                    }
                }
@@ -1155,8 +1150,7 @@ impl Service {
        let cancel = CancellationToken::new();
        let heartbeater = Heartbeater::new(
            config.jwt_token.clone(),
-            config.max_offline_interval,
-            config.max_warming_up_interval,
+            config.max_unavailable_interval,
            cancel.clone(),
        );
        let this = Arc::new(Self {
@@ -1668,23 +1662,21 @@ impl Service {
                    | NodeSchedulingPolicy::Filling
            );

-            let mut new_nodes = (**nodes).clone();
-            if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
-                if reset_scheduling {
-                    node.set_scheduling(NodeSchedulingPolicy::Active);
+            if !node.is_available() || reset_scheduling {
+                let mut new_nodes = (**nodes).clone();
+                if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
+                    if !node.is_available() {
+                        node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
+                    }
+
+                    if reset_scheduling {
+                        node.set_scheduling(NodeSchedulingPolicy::Active);
+                    }
+
+                    scheduler.node_upsert(node);
+                    let new_nodes = Arc::new(new_nodes);
+                    *nodes = new_nodes;
                }
-
-                tracing::info!("Marking {} warming-up on reattach", reattach_req.node_id);
-                node.set_availability(NodeAvailability::WarmingUp(std::time::Instant::now()));
-
-                scheduler.node_upsert(node);
-                let new_nodes = Arc::new(new_nodes);
-                *nodes = new_nodes;
-            } else {
-                tracing::error!(
-                    "Reattaching node {} was removed while processing the request",
-                    reattach_req.node_id
-                );
            }
        }

@@ -4725,15 +4717,6 @@ impl Service {

                // TODO: in the background, we should balance work back onto this pageserver
            }
-            // No action required for the intermediate unavailable state.
-            // When we transition into active or offline from the unavailable state,
-            // the correct handling above will kick in.
-            AvailabilityTransition::ToWarmingUpFromActive => {
-                tracing::info!("Node {} transition to unavailable from active", node_id);
-            }
-            AvailabilityTransition::ToWarmingUpFromOffline => {
-                tracing::info!("Node {} transition to unavailable from offline", node_id);
-            }
            AvailabilityTransition::Unchanged => {
                tracing::debug!("Node {} no availability change during config", node_id);
            }
--- a/storage_scrubber/Cargo.toml
+++ b/storage_scrubber/Cargo.toml
@@ -49,5 +49,6 @@ tracing.workspace = true
 tracing-subscriber.workspace = true
 clap.workspace = true
 tracing-appender = "0.2"
+histogram = "0.7"

 futures.workspace = true
--- a/storage_scrubber/README.md
+++ b/storage_scrubber/README.md
@@ -45,11 +45,7 @@ processing by the `purge-garbage` subcommand.

 Example:

-`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=[client_key] CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
-
-Note that `CLOUD_ADMIN_API_TOKEN` can be obtained from https://console-stage.neon.build/app/settings/api-keys (for staging) or https://console.neon.tech/app/settings/api-keys for production. This is not the control plane admin JWT key. The env var name is confusing. Though anyone can generate that API key, you still need admin permission in order to access all projects in the region.
-
-And note that `CLOUD_ADMIN_API_URL` should include the region in the admin URL due to the control plane / console split. For example, `https://console-stage.neon.build/regions/aws-us-east-2/api/v1/admin` for the staging us-east-2 region.
+`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`

 #### `purge-garbage`

@@ -65,7 +61,7 @@ to pass them on the command line

 Example:

-`env AWS_PROFILE=dev cargo run --release -- purge-garbage --input-path=eu-west-1-garbage.json`
+`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`

 Add the `--delete` argument before `purge-garbage` to enable deletion.  This is intentionally
 not provided inline in the example above to avoid accidents.  Without the `--delete` flag
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -2,7 +2,6 @@ use std::collections::{HashMap, HashSet};

 use anyhow::Context;
 use aws_sdk_s3::Client;
-use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
 use tracing::{error, info, warn};
@@ -13,7 +12,7 @@ use crate::cloud_admin_api::BranchData;
 use crate::metadata_stream::stream_listing;
 use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
 use futures_util::StreamExt;
-use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
+use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use remote_storage::RemotePath;
@@ -42,9 +41,7 @@ impl TimelineAnalysis {
    }
 }

-pub(crate) async fn branch_cleanup_and_check_errors(
-    s3_client: &Client,
-    target: &RootTarget,
+pub(crate) fn branch_cleanup_and_check_errors(
    id: &TenantShardTimelineId,
    tenant_objects: &mut TenantObjectListing,
    s3_active_branch: Option<&BranchData>,
@@ -88,17 +85,15 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                    }

                    if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
-                        info!(
+                        result.warnings.push(format!(
                            "index_part.json version is not latest: {}",
                            index_part.version()
-                        );
+                        ))
                    }

                    if index_part.metadata.disk_consistent_lsn()
                        != index_part.duplicated_disk_consistent_lsn()
                    {
-                        // Tech debt: let's get rid of one of these, they are redundant
-                        // https://github.com/neondatabase/neon/issues/8343
                        result.errors.push(format!(
                            "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})",
                            index_part.metadata.disk_consistent_lsn(),
@@ -107,16 +102,8 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                    }

                    if index_part.layer_metadata.is_empty() {
-                        if index_part.metadata.ancestor_timeline().is_none() {
-                            // The initial timeline with no ancestor should ALWAYS have layers.
-                            result.errors.push(
-                                "index_part.json has no layers (ancestor_timeline=None)"
-                                    .to_string(),
-                            );
-                        } else {
-                            // Not an error, can happen for branches with zero writes, but notice that
-                            info!("index_part.json has no layers (ancestor_timeline exists)");
-                        }
+                        // Not an error (branches with zero writes have no layers), but worth logging.
+                        info!("index_part.json has no layers");
                    }

                    for (layer, metadata) in index_part.layer_metadata {
@@ -127,41 +114,16 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                        }

                        if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) {
-                            let path = remote_layer_path(
-                                &id.tenant_shard_id.tenant_id,
-                                &id.timeline_id,
-                                metadata.shard,
-                                &layer,
-                                metadata.generation,
-                            );
-
-                            // HEAD request used here to address a race condition  when an index was uploaded concurrently
-                            // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot.
-                            let response = s3_client
-                                .head_object()
-                                .bucket(target.bucket_name())
-                                .key(path.get_path().as_str())
-                                .send()
-                                .await;
-
-                            if response.is_err() {
-                                // Object is not present.
-                                let is_l0 = LayerMap::is_l0(layer.key_range());
-
-                                let msg = format!(
-                                    "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
-                                    layer,
-                                    metadata.generation.get_suffix(),
-                                    metadata.shard,
-                                    is_l0,
-                                );
-
-                                if is_l0 {
-                                    result.warnings.push(msg);
-                                } else {
-                                    result.errors.push(msg);
-                                }
-                            }
+                            // FIXME: this will emit false positives if an index was
+                            // uploaded concurrently with our scan.  To make this check
+                            // correct, we need to try sending a HEAD request for the
+                            // layer we think is missing.
+                            result.errors.push(format!(
+                                "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage",
+                                layer,
+                                metadata.generation.get_suffix(),
+                                metadata.shard
+                            ))
                        }
                    }
                }
@@ -341,9 +303,6 @@ pub(crate) async fn list_timeline_blobs(
                tracing::debug!("initdb archive {key}");
                initdb_archive = true;
            }
-            Some("initdb-preserved.tar.zst") => {
-                tracing::info!("initdb archive preserved {key}");
-            }
            Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
                Ok((new_layer, gen)) => {
                    tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen);
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -8,19 +8,21 @@ use std::{
 };

 use anyhow::Context;
+use aws_sdk_s3::{
+    types::{Delete, ObjectIdentifier},
+    Client,
+};
 use futures_util::TryStreamExt;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath};
 use serde::{Deserialize, Serialize};
 use tokio_stream::StreamExt;
-use tokio_util::sync::CancellationToken;
 use utils::id::TenantId;

 use crate::{
    cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
-    init_remote, init_remote_generic,
-    metadata_stream::{stream_tenant_timelines, stream_tenants},
-    BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
+    init_remote,
+    metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants},
+    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth,
 };

 #[derive(Serialize, Deserialize, Debug)]
@@ -322,45 +324,41 @@ impl std::fmt::Display for PurgeMode {
 }

 pub async fn get_tenant_objects(
-    s3_client: &GenericRemoteStorage,
+    s3_client: &Arc<Client>,
+    target: RootTarget,
    tenant_shard_id: TenantShardId,
-) -> anyhow::Result<Vec<RemotePath>> {
+) -> anyhow::Result<Vec<ObjectIdentifier>> {
    tracing::debug!("Listing objects in tenant {tenant_shard_id}");
-    let tenant_root = super::remote_tenant_path(&tenant_shard_id);
-
    // TODO: apply extra validation based on object modification time.  Don't purge
    // tenants where any timeline's index_part.json has been touched recently.

-    let list = s3_client
-        .list(
-            Some(&tenant_root),
-            ListingMode::NoDelimiter,
-            None,
-            &CancellationToken::new(),
-        )
-        .await?;
-    Ok(list.keys)
+    let mut tenant_root = target.tenant_root(&tenant_shard_id);
+
+    // Remove delimiter, so that object listing lists all keys in the prefix and not just
+    // common prefixes.
+    tenant_root.delimiter = String::new();
+
+    let key_stream = stream_listing(s3_client, &tenant_root);
+    key_stream.try_collect().await
 }

 pub async fn get_timeline_objects(
-    s3_client: &GenericRemoteStorage,
+    s3_client: &Arc<Client>,
+    target: RootTarget,
    ttid: TenantShardTimelineId,
-) -> anyhow::Result<Vec<RemotePath>> {
+) -> anyhow::Result<Vec<ObjectIdentifier>> {
    tracing::debug!("Listing objects in timeline {ttid}");
-    let timeline_root = super::remote_timeline_path_id(&ttid);
+    let mut timeline_root = target.timeline_root(&ttid);

    // TODO: apply extra validation based on object modification time.  Don't purge
    // timelines whose index_part.json has been touched recently.

-    let list = s3_client
-        .list(
-            Some(&timeline_root),
-            ListingMode::NoDelimiter,
-            None,
-            &CancellationToken::new(),
-        )
-        .await?;
-    Ok(list.keys)
+    // Remove delimiter, so that object listing lists all keys in the prefix and not just
+    // common prefixes.
+    timeline_root.delimiter = String::new();
+    let key_stream = stream_listing(s3_client, &timeline_root);
+
+    key_stream.try_collect().await
 }

 const MAX_KEYS_PER_DELETE: usize = 1000;
@@ -371,17 +369,16 @@ const MAX_KEYS_PER_DELETE: usize = 1000;
 /// MAX_KEYS_PER_DELETE keys are left.
 /// `num_deleted` returns number of deleted keys.
 async fn do_delete(
-    remote_client: &GenericRemoteStorage,
-    keys: &mut Vec<RemotePath>,
+    s3_client: &Arc<Client>,
+    bucket_name: &str,
+    keys: &mut Vec<ObjectIdentifier>,
    dry_run: bool,
    drain: bool,
    progress_tracker: &mut DeletionProgressTracker,
 ) -> anyhow::Result<()> {
-    let cancel = CancellationToken::new();
    while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
        let request_keys =
            keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
-
        let num_deleted = request_keys.len();
        if dry_run {
            tracing::info!("Dry-run deletion of objects: ");
@@ -389,10 +386,14 @@ async fn do_delete(
                tracing::info!("  {k:?}");
            }
        } else {
-            remote_client
-                .delete_objects(&request_keys, &cancel)
+            let delete_request = s3_client
+                .delete_objects()
+                .bucket(bucket_name)
+                .delete(Delete::builder().set_objects(Some(request_keys)).build()?);
+            delete_request
+                .send()
                .await
-                .context("deletetion request")?;
+                .context("DeleteObjects request")?;
            progress_tracker.register(num_deleted);
        }
    }
@@ -430,13 +431,8 @@ pub async fn purge_garbage(
        input_path
    );

-    let remote_client =
-        init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
-
-    assert_eq!(
-        &garbage_list.bucket_config.bucket,
-        remote_client.bucket_name().unwrap()
-    );
+    let (s3_client, target) =
+        init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;

    // Sanity checks on the incoming list
    if garbage_list.active_tenant_count == 0 {
@@ -468,13 +464,16 @@ pub async fn purge_garbage(

    let items = tokio_stream::iter(filtered_items.map(Ok));
    let get_objects_results = items.map_ok(|i| {
-        let remote_client = remote_client.clone();
+        let s3_client = s3_client.clone();
+        let target = target.clone();
        async move {
            match i.entity {
                GarbageEntity::Tenant(tenant_id) => {
-                    get_tenant_objects(&remote_client, tenant_id).await
+                    get_tenant_objects(&s3_client, target, tenant_id).await
+                }
+                GarbageEntity::Timeline(ttid) => {
+                    get_timeline_objects(&s3_client, target, ttid).await
                }
-                GarbageEntity::Timeline(ttid) => get_timeline_objects(&remote_client, ttid).await,
            }
        }
    });
@@ -488,7 +487,8 @@ pub async fn purge_garbage(
        objects_to_delete.append(&mut object_list);
        if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
            do_delete(
-                &remote_client,
+                &s3_client,
+                &garbage_list.bucket_config.bucket,
                &mut objects_to_delete,
                dry_run,
                false,
@@ -499,7 +499,8 @@ pub async fn purge_garbage(
    }

    do_delete(
-        &remote_client,
+        &s3_client,
+        &garbage_list.bucket_config.bucket,
        &mut objects_to_delete,
        dry_run,
        true,
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -22,13 +22,9 @@ use aws_sdk_s3::Client;

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::ValueEnum;
-use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{
-    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
-    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
-};
+use remote_storage::RemotePath;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
 use tokio::io::AsyncReadExt;
@@ -219,10 +215,6 @@ impl RootTarget {
    }
 }

-pub fn remote_timeline_path_id(id: &TenantShardTimelineId) -> RemotePath {
-    remote_timeline_path(&id.tenant_shard_id, &id.timeline_id)
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(deny_unknown_fields)]
 pub struct BucketConfig {
@@ -304,7 +296,7 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
    }
 }

-async fn init_s3_client(bucket_region: Region) -> Client {
+pub async fn init_s3_client(bucket_region: Region) -> Client {
    let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
        .region(bucket_region)
        .load()
@@ -312,13 +304,6 @@ async fn init_s3_client(bucket_region: Region) -> Client {
    Client::new(&config)
 }

-fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
-    match node_kind {
-        NodeKind::Pageserver => "pageserver/v1/",
-        NodeKind::Safekeeper => "wal/",
-    }
-}
-
 async fn init_remote(
    bucket_config: BucketConfig,
    node_kind: NodeKind,
@@ -326,17 +311,18 @@ async fn init_remote(
    let bucket_region = Region::new(bucket_config.region);
    let delimiter = "/".to_string();
    let s3_client = Arc::new(init_s3_client(bucket_region).await);
-    let default_prefix = default_prefix_in_bucket(node_kind).to_string();

    let s3_root = match node_kind {
        NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
+            prefix_in_bucket: bucket_config
+                .prefix_in_bucket
+                .unwrap_or("pageserver/v1".to_string()),
            delimiter,
        }),
        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
+            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()),
            delimiter,
        }),
    };
@@ -344,31 +330,6 @@ async fn init_remote(
    Ok((s3_client, s3_root))
 }

-async fn init_remote_generic(
-    bucket_config: BucketConfig,
-    node_kind: NodeKind,
-) -> anyhow::Result<GenericRemoteStorage> {
-    let endpoint = env::var("AWS_ENDPOINT_URL").ok();
-    let default_prefix = default_prefix_in_bucket(node_kind).to_string();
-    let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
-    let storage = S3Config {
-        bucket_name: bucket_config.bucket,
-        bucket_region: bucket_config.region,
-        prefix_in_bucket,
-        endpoint,
-        concurrency_limit: DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
-            .try_into()
-            .unwrap(),
-        max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
-        upload_storage_class: None,
-    };
-    let storage_config = RemoteStorageConfig {
-        storage: RemoteStorageKind::AwsS3(storage),
-        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
-    };
-    GenericRemoteStorage::from_config(&storage_config).await
-}
-
 async fn list_objects_with_retries(
    s3_client: &Client,
    s3_target: &S3Target,
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -8,11 +8,12 @@ use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
 use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use aws_sdk_s3::Client;
 use futures_util::{StreamExt, TryStreamExt};
+use histogram::Histogram;
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
+use pageserver::tenant::IndexPart;
 use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use utils::id::TenantId;
-use utils::shard::ShardCount;

 #[derive(Serialize)]
 pub struct MetadataSummary {
@@ -23,6 +24,66 @@ pub struct MetadataSummary {
    with_warnings: HashSet<TenantShardTimelineId>,
    with_orphans: HashSet<TenantShardTimelineId>,
    indices_by_version: HashMap<usize, usize>,
+
+    layer_count: MinMaxHisto,
+    timeline_size_bytes: MinMaxHisto,
+    layer_size_bytes: MinMaxHisto,
+}
+
+/// A histogram plus minimum and maximum tracking
+#[derive(Serialize)]
+struct MinMaxHisto {
+    #[serde(skip)]
+    histo: Histogram, // bucketed distribution; left out of the serialized form (serde skip)
+    min: u64, // smallest value passed to `sample`; `u64::MAX` until the first sample
+    max: u64, // largest value passed to `sample`; 0 until the first sample
+}
+
+impl MinMaxHisto {
+    /// Creates an empty tracker: `min` starts at `u64::MAX` and `max` at 0.
+    fn new() -> Self {
+        Self {
+            histo: histogram::Histogram::builder()
+                .build()
+                .expect("Bad histogram params"),
+            min: u64::MAX,
+            max: 0,
+        }
+    }
+
+    /// Records one sample; a histogram error is logged and returned (min/max are still updated).
+    fn sample(&mut self, v: u64) -> Result<(), histogram::Error> {
+        self.min = std::cmp::min(self.min, v);
+        self.max = std::cmp::max(self.max, v);
+        let r = self.histo.increment(v, 1);
+        if r.is_err() {
+            tracing::warn!("Bad histogram sample: {v}");
+        }
+        r
+    }
+
+    /// Formats min, the 1/10/50/90/99th percentiles and max on one line ("No data: .." on error).
+    fn oneline(&self) -> String {
+        let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) {
+            Ok(p) => p,
+            Err(e) => return format!("No data: {}", e),
+        };
+        // Report each percentile as its bucket's midpoint: low + (high - low) / 2, not low + high / 2.
+        let percentiles: Vec<u64> = percentiles
+            .iter()
+            .map(|p| p.bucket().low() + (p.bucket().high() - p.bucket().low()) / 2)
+            .collect();
+        format!(
+            "min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, max {}",
+            self.min,
+            percentiles[0],
+            percentiles[1],
+            percentiles[2],
+            percentiles[3],
+            percentiles[4],
+            self.max,
+        )
+    }
 }

 impl MetadataSummary {
@@ -35,9 +96,25 @@ impl MetadataSummary {
            with_warnings: HashSet::new(),
            with_orphans: HashSet::new(),
            indices_by_version: HashMap::new(),
+            layer_count: MinMaxHisto::new(),
+            timeline_size_bytes: MinMaxHisto::new(),
+            layer_size_bytes: MinMaxHisto::new(),
        }
    }

+    /// Feeds one timeline's index into the layer-count, layer-size and timeline-size
+    /// histograms, returning early with the error on the first rejected sample.
+    fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> {
+        let layers = &index_part.layer_metadata;
+        self.layer_count.sample(layers.len() as u64)?;
+        let mut timeline_bytes: u64 = 0;
+        for layer_bytes in layers.values().map(|meta| meta.file_size) {
+            timeline_bytes += layer_bytes;
+            self.layer_size_bytes.sample(layer_bytes)?;
+        }
+        self.timeline_size_bytes.sample(timeline_bytes)
+    }
+
    fn update_data(&mut self, data: &S3TimelineBlobData) {
        self.timeline_shard_count += 1;
        if let BlobDataParseResult::Parsed {
@@ -50,6 +127,14 @@ impl MetadataSummary {
                .indices_by_version
                .entry(index_part.version())
                .or_insert(0) += 1;
+
+            if let Err(e) = self.update_histograms(index_part) {
+                // Value out of range?  Warn that the results are untrustworthy
+                tracing::warn!(
+                    "Error updating histograms, summary stats may be wrong: {}",
+                    e
+                );
+            }
        }
    }

@@ -84,6 +169,9 @@ With errors: {}
 With warnings: {}
 With orphan layers: {}
 Index versions: {version_summary}
+Timeline size bytes: {}
+Layer size bytes: {}
+Timeline layer count: {}
 ",
            self.tenant_count,
            self.timeline_count,
@@ -91,6 +179,9 @@ Index versions: {version_summary}
            self.with_errors.len(),
            self.with_warnings.len(),
            self.with_orphans.len(),
+            self.timeline_size_bytes.oneline(),
+            self.layer_size_bytes.oneline(),
+            self.layer_count.oneline(),
        )
    }

@@ -144,60 +235,33 @@ pub async fn scan_metadata(
    let mut tenant_objects = TenantObjectListing::default();
    let mut tenant_timeline_results = Vec::new();

-    async fn analyze_tenant(
-        s3_client: &Client,
-        target: &RootTarget,
+    fn analyze_tenant(
        tenant_id: TenantId,
        summary: &mut MetadataSummary,
        mut tenant_objects: TenantObjectListing,
        timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>,
-        highest_shard_count: ShardCount,
    ) {
        summary.tenant_count += 1;

        let mut timeline_ids = HashSet::new();
        let mut timeline_generations = HashMap::new();
        for (ttid, data) in timelines {
-            if ttid.tenant_shard_id.shard_count == highest_shard_count {
-                // Only analyze `TenantShardId`s with highest shard count.
-
-                // Stash the generation of each timeline, for later use identifying orphan layers
-                if let BlobDataParseResult::Parsed {
-                    index_part,
-                    index_part_generation,
-                    s3_layers: _s3_layers,
-                } = &data.blob_data
-                {
-                    if index_part.deleted_at.is_some() {
-                        // skip deleted timeline.
-                        tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid);
-                        continue;
-                    }
-                    timeline_generations.insert(ttid, *index_part_generation);
-                }
-
-                // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
-                // reference counts for layers across the tenant.
-                let analysis = branch_cleanup_and_check_errors(
-                    s3_client,
-                    target,
-                    &ttid,
-                    &mut tenant_objects,
-                    None,
-                    None,
-                    Some(data),
-                )
-                .await;
-                summary.update_analysis(&ttid, &analysis);
-
-                timeline_ids.insert(ttid.timeline_id);
-            } else {
-                tracing::info!(
-                    "Skip analysis of {} b/c a lower shard count than {}",
-                    ttid,
-                    highest_shard_count.0,
-                );
+            timeline_ids.insert(ttid.timeline_id);
+            // Stash the generation of each timeline, for later use identifying orphan layers
+            if let BlobDataParseResult::Parsed {
+                index_part: _index_part,
+                index_part_generation,
+                s3_layers: _s3_layers,
+            } = &data.blob_data
+            {
+                timeline_generations.insert(ttid, *index_part_generation);
            }
+
+            // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
+            // reference counts for layers across the tenant.
+            let analysis =
+                branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data));
+            summary.update_analysis(&ttid, &analysis);
        }

        summary.timeline_count += timeline_ids.len();
@@ -245,35 +309,18 @@ pub async fn scan_metadata(
    // all results for the same tenant will be adjacent.  We accumulate these,
    // and then call `analyze_tenant` to flush, when we see the next tenant ID.
    let mut summary = MetadataSummary::new();
-    let mut highest_shard_count = ShardCount::MIN;
    while let Some(i) = timelines.next().await {
        let (ttid, data) = i?;
        summary.update_data(&data);

        match tenant_id {
-            None => {
-                tenant_id = Some(ttid.tenant_shard_id.tenant_id);
-                highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
-            }
+            None => tenant_id = Some(ttid.tenant_shard_id.tenant_id),
            Some(prev_tenant_id) => {
                if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
-                    // New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results
                    let tenant_objects = std::mem::take(&mut tenant_objects);
                    let timelines = std::mem::take(&mut tenant_timeline_results);
-                    analyze_tenant(
-                        &s3_client,
-                        &target,
-                        prev_tenant_id,
-                        &mut summary,
-                        tenant_objects,
-                        timelines,
-                        highest_shard_count,
-                    )
-                    .await;
+                    analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines);
                    tenant_id = Some(ttid.tenant_shard_id.tenant_id);
-                    highest_shard_count = ttid.tenant_shard_id.shard_count;
-                } else {
-                    highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
                }
            }
        }
@@ -291,15 +338,11 @@ pub async fn scan_metadata(

    if !tenant_timeline_results.is_empty() {
        analyze_tenant(
-            &s3_client,
-            &target,
            tenant_id.expect("Must be set if results are present"),
            &mut summary,
            tenant_objects,
            tenant_timeline_results,
-            highest_shard_count,
-        )
-        .await;
+        );
    }

    Ok(summary)
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -143,9 +143,6 @@ class TimelineId(Id):
    def __repr__(self) -> str:
        return f'TimelineId("{self.id.hex()}")'

-    def __str__(self) -> str:
-        return self.id.hex()
-

 # Workaround for compat with python 3.9, which does not have `typing.Self`
 TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId")
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -133,8 +133,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    *histogram("pageserver_remote_operation_seconds"),
    *histogram("pageserver_io_operations_seconds"),
    "pageserver_tenant_states_count",
-    "pageserver_circuit_breaker_broken_total",
-    "pageserver_circuit_breaker_unbroken_total",
 )

 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -523,7 +523,7 @@ class NeonEnvBuilder:
        self.preserve_database_files = preserve_database_files
        self.initial_tenant = initial_tenant or TenantId.generate()
        self.initial_timeline = initial_timeline or TimelineId.generate()
-        self.enable_scrub_on_exit = True
+        self.scrub_on_exit = False
        self.test_output_dir = test_output_dir
        self.test_overlay_dir = test_overlay_dir
        self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = []
@@ -852,13 +852,6 @@ class NeonEnvBuilder:
        )
        ident_state_dir.rmdir()  # should be empty since we moved `upper` out

-    def disable_scrub_on_exit(self):
-        """
-        Some tests intentionally leave the remote storage contents empty or corrupt,
-        so it doesn't make sense to do the usual scrub at the end of the test.
-        """
-        self.enable_scrub_on_exit = False
-
    def overlay_cleanup_teardown(self):
        """
        Unmount the overlayfs mounts created by `self.overlay_mount()`.
@@ -884,6 +877,23 @@ class NeonEnvBuilder:
        # assert all overlayfs mounts in our test directory are gone
        assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir))

+    def enable_scrub_on_exit(self):
+        """
+        Opt in to having the fixture run storage_scrubber automatically at the
+        end of the test, as a bidirectional check that the scrubber works and
+        that the test didn't produce any invalid remote state.  Raises
+        RuntimeError unless the pageserver remote storage is an S3Storage.
+        """
+
+        if not isinstance(self.pageserver_remote_storage, S3Storage):
+            # The scrubber can't talk to e.g. LocalFS -- it needs
+            # an HTTP endpoint (mock is fine) to connect to.
+            raise RuntimeError(
+                f"Cannot scrub with remote_storage={self.pageserver_remote_storage}, require an S3 endpoint"
+            )
+
+        self.scrub_on_exit = True
+
    def enable_pageserver_remote_storage(
        self,
        remote_storage_kind: RemoteStorageKind,
@@ -985,12 +995,7 @@ class NeonEnvBuilder:
            )
            cleanup_error = None

-            # If we are running with S3Storage (required by the scrubber), check that whatever the test
-            # did does not generate any corruption
-            if (
-                isinstance(self.env.pageserver_remote_storage, S3Storage)
-                and self.enable_scrub_on_exit
-            ):
+            if self.scrub_on_exit:
                try:
                    self.env.storage_scrubber.scan_metadata()
                except Exception as e:
@@ -2143,23 +2148,6 @@ class StorageControllerApiException(Exception):
        self.status_code = status_code


-# See libs/pageserver_api/src/controller_api.rs
-# for the rust definitions of the enums below
-# TODO: Replace with `StrEnum` when we upgrade to python 3.11
-class PageserverAvailability(str, Enum):
-    ACTIVE = "Active"
-    UNAVAILABLE = "Unavailable"
-    OFFLINE = "Offline"
-
-
-class PageserverSchedulingPolicy(str, Enum):
-    ACTIVE = "Active"
-    DRAINING = "Draining"
-    FILLING = "Filling"
-    PAUSE = "Pause"
-    PAUSE_FOR_RESTART = "PauseForRestart"
-
-
 class NeonStorageController(MetricsGetter, LogUtils):
    def __init__(self, env: NeonEnv, auth_enabled: bool):
        self.env = env
@@ -2543,54 +2531,26 @@ class NeonStorageController(MetricsGetter, LogUtils):
        )
        log.info("storage controller passed consistency check")

-    def node_registered(self, node_id: int) -> bool:
-        """
-        Returns true if the storage controller can confirm
-        it knows of pageserver with 'node_id'
-        """
-        try:
-            self.node_status(node_id)
-        except StorageControllerApiException as e:
-            if e.status_code == 404:
-                return False
-            else:
-                raise e
-
-        return True
-
    def poll_node_status(
-        self,
-        node_id: int,
-        desired_availability: Optional[PageserverAvailability],
-        desired_scheduling_policy: Optional[PageserverSchedulingPolicy],
-        max_attempts: int,
-        backoff: int,
+        self, node_id: int, desired_scheduling_policy: str, max_attempts: int, backoff: int
    ):
        """
-        Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability'
-        or 'max_attempts' have been exhausted
+        Poll the node status until it reaches 'desired_scheduling_policy' or 'max_attempts' have been exhausted
        """
-        log.info(
-            f"Polling {node_id} for {desired_scheduling_policy} scheduling policy and {desired_availability} availability"
-        )
+        log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")
        while max_attempts > 0:
            try:
                status = self.node_status(node_id)
                policy = status["scheduling"]
-                availability = status["availability"]
-                if (desired_scheduling_policy is None or policy == desired_scheduling_policy) and (
-                    desired_availability is None or availability == desired_availability
-                ):
+                if policy == desired_scheduling_policy:
                    return
                else:
                    max_attempts -= 1
-                    log.info(
-                        f"Status call returned {policy=} {availability=} ({max_attempts} attempts left)"
-                    )
+                    log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")

                    if max_attempts == 0:
                        raise AssertionError(
-                            f"Status for {node_id=} did not reach {desired_scheduling_policy=} {desired_availability=}"
+                            f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
                        )

                    time.sleep(backoff)
@@ -2734,14 +2694,6 @@ class NeonPageserver(PgProtocol, LogUtils):
            self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds
        )
        self.running = True
-
-        if self.env.storage_controller.running and self.env.storage_controller.node_registered(
-            self.id
-        ):
-            self.env.storage_controller.poll_node_status(
-                self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1
-            )
-
        return self

    def stop(self, immediate: bool = False) -> "NeonPageserver":
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -102,7 +102,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
    # failing to connect to them.
    ".*Call to node.*management API.*failed.*receive body.*",
    ".*Call to node.*management API.*failed.*ReceiveBody.*",
-    ".*Failed to update node .+ after heartbeat round.*error sending request for url.*",
    # Many tests will start up with a node offline
    ".*startup_reconcile: Could not scan node.*",
    # Tests run in dev mode
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -662,7 +662,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        force_repartition=False,
        force_image_layer_creation=False,
        wait_until_uploaded=False,
-        compact: Optional[bool] = None,
    ):
        self.is_testing_enabled_or_skip()
        query = {}
@@ -673,9 +672,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        if wait_until_uploaded:
            query["wait_until_uploaded"] = "true"

-        if compact is not None:
-            query["compact"] = "true" if compact else "false"
-
        log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -1,4 +1,5 @@
 import concurrent.futures
+import time
 from typing import Any, Callable, Dict, Tuple

 import fixtures.pageserver.remote_storage
@@ -8,6 +9,9 @@ from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
 )
+from fixtures.pageserver.utils import (
+    wait_until_tenant_state,
+)
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind


@@ -42,33 +46,38 @@ def single_timeline(
    log.info(f"duplicating template tenant {ncopies} times in S3")
    tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)

-    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
-    # However, on-demand downloads are quite slow ATM.
-    # => do the on-demand downloads in Python.
-    log.info("python-side on-demand download the layer files into local tenant dir")
-    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
-    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
-        env, tenant_timelines
-    )
-
    log.info("attach duplicated tenants to pageserver")
    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
    # However, on-demand downloads are quite slow ATM.
    # => do the on-demand downloads in Python.
    assert ps_http.tenant_list() == []
+    # make the attach fail after it created enough on-disk state to retry loading
+    # the tenant next startup, but before it can start background loops that would start download
+    ps_http.configure_failpoints(("attach-before-activate", "return"))
+    env.pageserver.allowed_errors.append(
+        ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
+    )

-    def attach(tenant):
+    def attach_broken(tenant):
        env.pageserver.tenant_attach(
            tenant,
            config=template_config.copy(),
            generation=100,
            override_storage_controller_generation=True,
        )
+        time.sleep(0.1)
+        wait_until_tenant_state(ps_http, tenant, "Broken", 10)

    with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
-        executor.map(attach, tenants)
+        executor.map(attach_broken, tenants)

-    # Benchmarks will start the pageserver explicitly themselves
-    env.pageserver.stop()
+    env.pageserver.stop(
+        immediate=True
+    )  # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
+    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
+    log.info("python-side on-demand download the layer files into local tenant dir")
+    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
+        env, tenant_timelines
+    )

    return env
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -12,9 +12,8 @@ import boto3
 import toml
 from mypy_boto3_s3 import S3Client

-from fixtures.common_types import TenantId, TenantShardId, TimelineId
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.pageserver.common_types import IndexPartDump

 TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
 TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json"
@@ -266,38 +265,9 @@ class S3Storage:
    def tenants_path(self) -> str:
        return f"{self.prefix_in_bucket}/tenants"

-    def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str:
+    def tenant_path(self, tenant_id: TenantId) -> str:
        return f"{self.tenants_path()}/{tenant_id}"

-    def timeline_path(
-        self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId
-    ) -> str:
-        return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}"
-
-    def get_latest_index_key(self, index_keys: List[str]) -> str:
-        """
-        Gets the latest index file key.
-
-        @param index_keys: A list of index keys of different generations.
-        """
-
-        def parse_gen(index_key: str) -> int:
-            parts = index_key.split("index_part.json-")
-            return int(parts[-1], base=16) if len(parts) == 2 else -1
-
-        return max(index_keys, key=parse_gen)
-
-    def download_index_part(self, index_key: str) -> IndexPartDump:
-        """
-        Downloads the index content from remote storage.
-
-        @param index_key: index key in remote storage.
-        """
-        response = self.client.get_object(Bucket=self.bucket_name, Key=index_key)
-        body = response["Body"].read().decode("utf-8")
-        log.info(f"index_part.json: {body}")
-        return IndexPartDump.from_json(json.loads(body))
-
    def heatmap_key(self, tenant_id: TenantId) -> str:
        return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"

--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -17,11 +17,13 @@ from performance.pageserver.util import (
@pytest.mark.parametrize("duration", [30])
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
@pytest.mark.parametrize("n_tenants", [10])
+@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
@pytest.mark.timeout(1000)
 def test_basebackup_with_high_slru_count(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    pg_bin: PgBin,
+    get_vectored_impl: str,
    n_tenants: int,
    pgbench_scale: int,
    duration: int,
@@ -45,7 +47,7 @@ def test_basebackup_with_high_slru_count(
    max_file_descriptors = 500000
    neon_env_builder.pageserver_config_override = (
        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
-        f"get_vectored_impl='vectored'; validate_vectored_get=false"
+        f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false"
    )
    params.update(
        {
--- a/test_runner/performance/test_compaction.py
+++ b/test_runner/performance/test_compaction.py
@@ -135,7 +135,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare):
    # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which
    # this memory estimate can be revised far downwards to something that doesn't scale
    # linearly with the layer sizes.
-    MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5
+    MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25

    # If we find that compaction is using more memory, this may indicate a regression
    assert compaction_mapped_rss < MEMORY_ESTIMATE
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -8,12 +8,7 @@ import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    NeonEnv,
-    NeonEnvBuilder,
-    PageserverAvailability,
-    PageserverSchedulingPolicy,
-)
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pg_version import PgVersion

@@ -111,8 +106,7 @@ def test_storage_controller_many_tenants(
        # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
        # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to
        # guard against regressions in restart time.
-        "max_offline": "30s",
-        "max_warming_up": "300s",
+        "max_unavailable": "300s"
    }
    neon_env_builder.control_plane_compute_hook_api = (
        compute_reconfigure_listener.control_plane_compute_hook_api
@@ -280,11 +274,7 @@ def test_storage_controller_many_tenants(
        )

        env.storage_controller.poll_node_status(
-            ps.id,
-            PageserverAvailability.ACTIVE,
-            PageserverSchedulingPolicy.PAUSE_FOR_RESTART,
-            max_attempts=24,
-            backoff=5,
+            ps.id, "PauseForRestart", max_attempts=24, backoff=5
        )

        shard_counts = get_consistent_node_shard_counts(env, total_shards)
@@ -295,24 +285,12 @@ def test_storage_controller_many_tenants(
        assert sum(shard_counts.values()) == total_shards

        ps.restart()
-        env.storage_controller.poll_node_status(
-            ps.id,
-            PageserverAvailability.ACTIVE,
-            PageserverSchedulingPolicy.ACTIVE,
-            max_attempts=24,
-            backoff=1,
-        )
+        env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=1)

        env.storage_controller.retryable_node_operation(
            lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2
        )
-        env.storage_controller.poll_node_status(
-            ps.id,
-            PageserverAvailability.ACTIVE,
-            PageserverSchedulingPolicy.ACTIVE,
-            max_attempts=24,
-            backoff=5,
-        )
+        env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=5)

        shard_counts = get_consistent_node_shard_counts(env, total_shards)
        log.info(f"Shard counts after filling node {ps.id}: {shard_counts}")
--- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
@@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"

 [[package]]
 name = "openssl"
-version = "0.10.66"
+version = "0.10.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1"
+checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f"
 dependencies = [
 "bitflags 2.6.0",
 "cfg-if",
@@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"

 [[package]]
 name = "openssl-sys"
-version = "0.9.103"
+version = "0.9.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6"
+checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2"
 dependencies = [
 "cc",
 "libc",
--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -3,16 +3,9 @@ import asyncio
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.remote_storage import RemoteStorageKind
-from werkzeug.wrappers.request import Request
-from werkzeug.wrappers.response import Response


-def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
-    """
-    A relatively low level test of reconfiguring a compute's pageserver at runtime.  Usually this
-    is all done via the storage controller, but this test will disable the storage controller's compute
-    notifications, and instead update endpoints directly.
-    """
+def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    num_connections = 3

    neon_env_builder.num_pageservers = 2
@@ -21,24 +14,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
    )
    env = neon_env_builder.init_start()

-    neon_env_builder.control_plane_compute_hook_api = (
-        f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach"
-    )
-
-    def ignore_notify(request: Request):
-        # This test does direct updates to compute configuration: disable the storage controller's notification
-        log.info(f"Ignoring storage controller compute notification: {request.json}")
-        return Response(status=200)
-
-    make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(
-        ignore_notify
-    )
-
    env.neon_cli.create_branch("test_change_pageserver")
    endpoint = env.endpoints.create_start("test_change_pageserver")

-    # Put this tenant into a dual-attached state
-    assert env.get_tenant_pageserver(env.initial_tenant) == env.pageservers[0]
    alt_pageserver_id = env.pageservers[1].id
    env.pageservers[1].tenant_attach(env.initial_tenant)

@@ -94,7 +72,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
    env.pageservers[
        0
    ].stop()  # Stop the old pageserver just to make sure we're reading from the new one
-    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})

    execute("SELECT count(*) FROM foo")
    assert fetchone() == (100000,)
@@ -105,10 +82,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
    #
    # Since we're dual-attached, need to tip-off storage controller to treat the one we're
    # about to start as the attached pageserver
+    env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
    env.pageservers[0].start()
    env.pageservers[1].stop()
-    env.storage_controller.node_configure(env.pageservers[1].id, {"availability": "Offline"})
-    env.storage_controller.reconcile_until_idle()

    endpoint.reconfigure(pageserver_id=env.pageservers[0].id)

@@ -116,9 +92,10 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver):
    assert fetchone() == (100000,)

    env.pageservers[0].stop()
+    # Since we're dual-attached, need to tip-off storage controller to treat the one we're
+    # about to start as the attached pageserver
+    env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
    env.pageservers[1].start()
-    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
-    env.storage_controller.reconcile_until_idle()

    # Test a (former) bug where a child process spins without updating its connection string
    # by executing a query separately. This query will hang until we issue the reconfigure.
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -21,10 +21,6 @@ from fixtures.utils import human_bytes, wait_until

 GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"

-# access times in the pageserver are stored at a very low resolution: to generate meaningfully different
-# values, tests must inject sleeps
-ATIME_RESOLUTION = 2
-

@pytest.mark.parametrize("config_level_override", [None, 400])
 def test_min_resident_size_override_handling(
@@ -71,11 +67,14 @@ def test_min_resident_size_override_handling(

@enum.unique
 class EvictionOrder(str, enum.Enum):
+    ABSOLUTE_ORDER = "absolute"
    RELATIVE_ORDER_EQUAL = "relative_equal"
    RELATIVE_ORDER_SPARE = "relative_spare"

    def config(self) -> Dict[str, Any]:
-        if self == EvictionOrder.RELATIVE_ORDER_EQUAL:
+        if self == EvictionOrder.ABSOLUTE_ORDER:
+            return {"type": "AbsoluteAccessed"}
+        elif self == EvictionOrder.RELATIVE_ORDER_EQUAL:
            return {
                "type": "RelativeAccessed",
                "args": {"highest_layer_count_loses_first": False},
@@ -385,7 +384,7 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):

@pytest.mark.parametrize(
    "order",
-    [EvictionOrder.RELATIVE_ORDER_EQUAL],
+    [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
 )
 def test_pageserver_evicts_until_pressure_is_relieved(
    eviction_env: EvictionEnv, order: EvictionOrder
@@ -419,7 +418,7 @@ def test_pageserver_evicts_until_pressure_is_relieved(

@pytest.mark.parametrize(
    "order",
-    [EvictionOrder.RELATIVE_ORDER_EQUAL],
+    [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
 )
 def test_pageserver_respects_overridden_resident_size(
    eviction_env: EvictionEnv, order: EvictionOrder
@@ -496,7 +495,7 @@ def test_pageserver_respects_overridden_resident_size(

@pytest.mark.parametrize(
    "order",
-    [EvictionOrder.RELATIVE_ORDER_EQUAL],
+    [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
 )
 def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder):
    """
@@ -527,6 +526,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
@pytest.mark.parametrize(
    "order",
    [
+        EvictionOrder.ABSOLUTE_ORDER,
        EvictionOrder.RELATIVE_ORDER_EQUAL,
        EvictionOrder.RELATIVE_ORDER_SPARE,
    ],
@@ -550,7 +550,6 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
    (tenant_id, timeline_id) = warm

    # make picked tenant more recently used than the other one
-    time.sleep(ATIME_RESOLUTION)
    env.warm_up_tenant(tenant_id)

    # Build up enough pressure to require evictions from both tenants,
@@ -573,38 +572,63 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
            later_tenant_usage < du_by_timeline[tenant]
        ), "all tenants should have lost some layers"

-    # with relative order what matters is the amount of layers, with a
-    # fudge factor of whether the eviction bothers tenants with highest
-    # layer count the most. last accessed times between tenants does not
-    # matter.
-    assert order in [EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE]
-    layers_now = env.count_layers_per_tenant(env.pageserver)
+    warm_size = later_du_by_timeline[warm]
+    cold_size = later_du_by_timeline[cold]

-    expected_ratio = later_total_on_disk / total_on_disk
-    log.info(
-        f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio"
-    )
+    if order == EvictionOrder.ABSOLUTE_ORDER:
+        # bounds for warmed_size
+        warm_lower = 0.5 * du_by_timeline[warm]

-    for tenant_id, original_count in tenant_layers.items():
-        count_now = layers_now[tenant_id]
-        ratio = count_now / original_count
-        abs_diff = abs(ratio - expected_ratio)
-        assert original_count > count_now
+        # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
+        # So, check for up to 3 here.
+        warm_upper = warm_lower + 3 * env.layer_size

-        expectation = 0.06
+        cold_upper = 2 * env.layer_size
+        log.info(f"tenants: warm={warm[0]}, cold={cold[0]}")
        log.info(
-            f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}"
+            f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
        )
-        # in this test case both relative_spare and relative_equal produce
-        # the same outcomes; this must be a quantization effect of similar
-        # sizes (-s4 and -s6) and small (5MB) layer size.
-        # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02
-        assert abs_diff < expectation
+        log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
+
+        assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
+        assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
+
+        assert (
+            cold_size < cold_upper
+        ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
+    else:
+        # with relative order what matters is the amount of layers, with a
+        # fudge factor of whether the eviction bothers tenants with highest
+        # layer count the most. last accessed times between tenants does not
+        # matter.
+        layers_now = env.count_layers_per_tenant(env.pageserver)
+
+        expected_ratio = later_total_on_disk / total_on_disk
+        log.info(
+            f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio"
+        )
+
+        for tenant_id, original_count in tenant_layers.items():
+            count_now = layers_now[tenant_id]
+            ratio = count_now / original_count
+            abs_diff = abs(ratio - expected_ratio)
+            assert original_count > count_now
+
+            expectation = 0.06
+            log.info(
+                f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}"
+            )
+            # in this test case both relative_spare and relative_equal produce
+            # the same outcomes; this must be a quantization effect of similar
+            # sizes (-s4 and -s6) and small (5MB) layer size.
+            # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02
+            assert abs_diff < expectation


@pytest.mark.parametrize(
    "order",
    [
+        EvictionOrder.ABSOLUTE_ORDER,
        EvictionOrder.RELATIVE_ORDER_EQUAL,
        EvictionOrder.RELATIVE_ORDER_SPARE,
    ],
@@ -627,10 +651,6 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or
    for scale in [1, 1, 1, 4]:
        timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale))

-        # Eviction times are stored at a low resolution.  We must ensure that the time between
-        # tenants is long enough for the pageserver to distinguish them.
-        time.sleep(ATIME_RESOLUTION)
-
    env.neon_cli.safekeeper_stop()

    for (tenant_id, timeline_id), scale in timelines:
@@ -660,7 +680,14 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or
    ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order"
    log.info(f"{ratios}")

-    if order == EvictionOrder.RELATIVE_ORDER_EQUAL:
+    if order == EvictionOrder.ABSOLUTE_ORDER:
+        # first tenant loses most
+        assert ratios[0] <= ratios[1], "first should lose the most"
+        assert ratios[1] < ratios[2], "second should lose some"
+        assert ratios[1] < 1.0
+        assert ratios[2] <= ratios[3], "third might not lose"
+        assert ratios[3] == 1.0, "tenant created last does not lose"
+    elif order == EvictionOrder.RELATIVE_ORDER_EQUAL:
        assert all([x for x in ratios if x < 1.0]), "all tenants lose layers"
    elif order == EvictionOrder.RELATIVE_ORDER_SPARE:
        # with different layer sizes and pg versions, there are different combinations
@@ -723,7 +750,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
            "type": "Failure",
            "mocked_error": "EIO",
        },
-        eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE,
+        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
    )

    env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO")
@@ -757,7 +784,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
            # This avoids accounting for metadata files & tenant conf in the tests.
            "name_filter": ".*__.*",
        },
-        eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE,
+        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
    )

    wait_until(
@@ -810,7 +837,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
            # This avoids accounting for metadata files & tenant conf in the tests.
            "name_filter": ".*__.*",
        },
-        eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE,
+        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
    )

    wait_until(
--- a/test_runner/regress/test_lfc_working_set_approximation.py
+++ b/test_runner/regress/test_lfc_working_set_approximation.py
@@ -89,7 +89,7 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
    )
    conn = endpoint.connect()
    cur = conn.cursor()
-    cur.execute("create extension neon")
+    cur.execute("create extension neon version '1.4'")
    cur.execute(
        "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
    )
--- a/Show More
+++ b/Show More